From f3a86d3616440bfc2bc4880f9f85efa42904ea1c Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Thu, 3 Mar 2022 20:10:43 -0800 Subject: [PATCH 01/17] added change for CILEN to support cuteSV --- src/mavis/convert/vcf.py | 22 ++++++++++++++++-- tests/test_mavis/convert/test_tools_vcf.py | 27 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index c1c07faf..72aa5ddd 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -37,6 +37,7 @@ class VcfInfoType(TypedDict, total=False): CHR2: str CIPOS: Tuple[int, int] CIEND: Tuple[int, int] + CILEN: Tuple[int, int] CT: str END: Optional[int] PRECISE: bool @@ -192,6 +193,19 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end, } ) + elif ( + info.get('CILEN') != None + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating break2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, end + info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], + } + ) else: std_row.update( { @@ -220,7 +234,11 @@ def convert_record(record: VcfRecordType) -> List[Dict]: except KeyError: pass std_row.update( - {k: v for k, v in info.items() if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CT'}} + { + k: v + for k, v in info.items() + if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CILEN', 'CT'} + } ) records.append(std_row) return records @@ -238,7 +256,7 @@ def parse_info(info_field): # convert info types for key in info: - if key in {'CIPOS', 'CIEND'}: + if key in {'CIPOS', 'CIEND', 'CILEN'}: ci_start, ci_end = info[key].split(',') info[key] = (int(ci_start), int(ci_end)) elif key == 'END': diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index f846d41a..e59a48c3 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -59,3 +59,30 @@ def test_convert_record(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' + + +def test_convert_record_cuteSV(): + variant_cilen = VcfRecordType( + id='vcf-cuteSV.INS', + pos=1853407, + chrom='chr5', + alts=[ + 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' + ], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + SUPTYPE="None", + STRANDS="None", + CIPOS=(-30, 30), + CILEN=(-65, 65), + ), + ) + variant_ins_cutesv = convert_record(variant_cilen) + assert len(variant_ins_cutesv) == 1 + variant_ins_cutesv = variant_ins_cutesv[0] + assert variant_ins_cutesv.get('break2_position_start') == 1853342 + assert variant_ins_cutesv.get('break2_position_end') == 1853472 From 608b3cd6a7278856e51e7fe5330608fe556cacb2 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Fri, 4 Mar 2022 13:03:10 -0800 Subject: [PATCH 02/17] new checks implemented to prevent bp sorting in edge cases --- src/mavis/breakpoint.py | 16 ++++++++++------ src/mavis/summary/summary.py | 4 +++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index ac3b188f..7ea5245a 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -259,11 +259,14 @@ def __init__( >>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-')) """ - - if b1.key[:3] > b2.key[:3]: - self.break1 = b2 - self.break2 = b1 - else: + if b1.chr != b2.chr or not Interval.overlaps(b1, b2): + if b1.key[:3] > b2.key[:3]: + self.break1 = b2 + self.break2 = b1 + else: + self.break1 = b1 + self.break2 = b2 + else: # handles case whereby overlapping calls should skip sorting https://github.com/bcgsc/mavis/issues/319 self.break1 = b1 self.break2 = b2 self.stranded = stranded @@ -310,11 +313,12 @@ def column(self, colname: str): return self.data.get(COLUMNS[colname]) def __str__(self): - return 'BPP({}, {}{}{})'.format( + return 'BPP({}, {}{}{}{})'.format( str(self.break1), str(self.break2), ', opposing={}'.format(self.opposing_strands) if not self.stranded else '', ', seq=' + repr(self.untemplated_seq) if self.untemplated_seq is not None else '', + ', tracking_id=' + self.tracking_id if self.tracking_id else '', ) def flatten(self): diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index f30089b4..4c022e97 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -129,7 +129,9 @@ def group_events(events): bpp.break2.strand != new_bpp.break2.strand, ] ): - raise AssertionError('cannot group events differing on key elements', bpp, new_bpp) + raise AssertionError( + 'cannot group events differing on key elements', str(bpp), str(new_bpp) + ) # Note: There are some attributes that shouldn't be lost if different, currently appending the information # The evidence could be better off as a max instead of a join From 165fc6439ac7df77783da050cd0227a5971198d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Fri, 4 Mar 2022 15:19:24 -0800 Subject: [PATCH 03/17] added file check prior to convertion --- src/mavis/convert/vcf.py | 11 ++++++++-- tests/test_mavis/convert/test_tools_vcf.py | 25 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 72aa5ddd..1ce63ff5 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -195,7 +195,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) elif ( info.get('CILEN') != None - ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating break2 + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 std_row.update( { COLUMNS.break1_position_start: max( @@ -206,6 +206,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], } ) + if std_row.get("break1_position_end") > std_row.get( + "break2_position_end" + ): # truncate breakpoint1 to max lenght of breakpoint2 + std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) else: std_row.update( { @@ -217,6 +221,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) + if std_row.get("break1_position_end") > std_row.get( + "break2_position_end" + ): # truncate breakpoint1 to max lenght of breakpoint2 + std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) @@ -225,7 +233,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: if 'SVTYPE' in info: std_row[COLUMNS.event_type] = info['SVTYPE'] - try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index e59a48c3..7cdd9631 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -86,3 +86,28 @@ def test_convert_record_cuteSV(): variant_ins_cutesv = variant_ins_cutesv[0] assert variant_ins_cutesv.get('break2_position_start') == 1853342 assert variant_ins_cutesv.get('break2_position_end') == 1853472 + + variant_cilen2 = VcfRecordType( + id='vcf-cuteSV.INS', + pos=1853407, + chrom='chr5', + alts=[ + 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' + ], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + SUPTYPE="None", + STRANDS="None", + CIPOS=(-30, 9999), + CILEN=(-65, 65), + ), + ) + variant_ins_cutesv_2 = convert_record(variant_cilen2) + assert len(variant_ins_cutesv_2) == 1 + variant_ins_cutesv_2 = variant_ins_cutesv_2[0] + assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break1_position_end') == 1853472 From 7051fc9b9cbcc8b44b680d0de7a29c78096e2513 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 7 Mar 2022 15:15:59 -0800 Subject: [PATCH 04/17] quick bugfixes of INS errors --- src/mavis/breakpoint.py | 25 ++-------- src/mavis/convert/vcf.py | 57 ++++++++++++++++++---- tests/test_mavis/convert/test_tools_vcf.py | 32 ++++++++++-- 3 files changed, 81 insertions(+), 33 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 7ea5245a..695f2416 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -39,7 +39,6 @@ def __init__( orient (ORIENT): the orientation (which side is retained at the break) strand (STRAND): the strand seq: the seq - Examples: >>> Breakpoint('1', 1, 2) >>> Breakpoint('1', 1) @@ -251,22 +250,17 @@ def __init__( opposing_strands: are the strands at the breakpoint opposite? i.e. +/- instead of +/+ untemplated_seq: seq between the breakpoints that is not part of either breakpoint data: optional dictionary of attributes associated with this pair - Note: untemplated_seq should always be given wrt to the positive/forward reference strand - Example: >>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-')) """ - if b1.chr != b2.chr or not Interval.overlaps(b1, b2): - if b1.key[:3] > b2.key[:3]: - self.break1 = b2 - self.break2 = b1 - else: - self.break1 = b1 - self.break2 = b2 - else: # handles case whereby overlapping calls should skip sorting https://github.com/bcgsc/mavis/issues/319 + + if b1.key[:3] > b2.key[:3]: + self.break1 = b2 + self.break2 = b1 + else: self.break1 = b1 self.break2 = b2 self.stranded = stranded @@ -355,13 +349,11 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: """ uses the chr, orientations and strands to determine the possible structural_variant types that this pair could support - Args: pair (BreakpointPair): the pair to classify distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) Returns: a list of possible SVTYPE - Example: >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair.classify(bpp) @@ -369,7 +361,6 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) >>> BreakpointPair.classify(bpp) {'deletion', 'insertion'} - Note: see [related theory documentation](/background/theory/#classifying-events) """ @@ -446,22 +437,16 @@ def breakpoint_sequence_homology(self, reference_genome: ReferenceGenome): this sequence comparison is done with reference to a reference genome and does not use novel or untemplated sequence in the comparison. For this reason, insertions will never return any homologous sequence - - small duplication event CTT => CTTCTT - GATACATTTCTTCTTGAAAA reference ---------<========== first breakpoint ===========>-------- second breakpoint ---------CT-CT------ first break homology -------TT-TT-------- second break homology - Args: reference_genome: dict of reference sequence by template/chr name - Returns: Tuple[str,str]: homologous sequence at the first breakpoint and second breakpoints - Raises: AttributeError: for non specific breakpoints """ diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 1ce63ff5..683addc2 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,6 +123,46 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) +def handle_edge_case(std_row: Dict) -> Dict: + """ + Checks for known errors caused by callers in IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. + + bp1_s = breakpoint1 start + bp1_e = breakpoint1 end + bp2_s = breakpoint2 start + bp2_e = breakpoint2 end + + Insertion edge cases include: + Scenario 1: bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. bp1_e > bp2_s + Scenario 2: bp1_s = 1890, bp1_e = 2000, bp2_s = 1800, bp2_e = 2000. bp1_e > bp1_s + Scenario 3: bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. bp1_s > bp2_s + """ + + try: + if ( + std_row['break1_position_end'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # Scenario 1 + std_row.update({'break1_position_end': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # Scenario 2 + std_row.update({'break1_position_start': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_start'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # Scenario 2 + std_row.update({'break2_position_start': std_row['break1_position_start']}) + return std_row + + except KeyError as err: + logger.warn(f'Missing required column inputs: {err}') + + def convert_record(record: VcfRecordType) -> List[Dict]: """ converts a vcf record @@ -182,6 +222,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: elif size < 0: std_row[COLUMNS.event_type] = SVTYPE.DEL std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2}) + + if 'SVTYPE' in info: + std_row[COLUMNS.event_type] = info['SVTYPE'] + if info.get( 'PRECISE', False ): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged @@ -206,10 +250,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], } ) - if std_row.get("break1_position_end") > std_row.get( - "break2_position_end" - ): # truncate breakpoint1 to max lenght of breakpoint2 - std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) + handle_edge_case(std_row) else: std_row.update( { @@ -221,18 +262,14 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - if std_row.get("break1_position_end") > std_row.get( - "break2_position_end" - ): # truncate breakpoint1 to max lenght of breakpoint2 - std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) + handle_edge_case(std_row) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: std_row.update({'break2_position_end': 1}) - if 'SVTYPE' in info: - std_row[COLUMNS.event_type] = info['SVTYPE'] try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index 7cdd9631..563f8d2c 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -10,7 +10,7 @@ def test_read_vcf(): assert df.shape[0] == 106 -def test_convert_record(): +def test_convert_record_sniffle(): variant_imprecise = VcfRecordType( id='mock-BND-imprecise', pos=0, @@ -60,6 +60,30 @@ def test_convert_record(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' + variant_cilen3 = VcfRecordType( + id='Sniffle.INS', + pos=11184, + chrom='chr2', + alts=[ + 'AGGAGGCGCACCGGGGCACCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGGCGAGAGAGGCGCGACCGCGCCCGCGTAGGCGCAGAGAGGCGCACCGCGCCCCGCGCAGGCGCAGGCGCACCCGCGCCCGCGCAGGCGCAGAGAGGCGTGACCCGCGCCCGCGCAGG' + ], + ref='N', + info=VcfInfoType( + CHR2="chr2", + IMPRECISE=True, + SVMETHOD="Snifflesv1.0.12", + SVTYPE="INS", + SUPTYPE="AL", + STRANDS="+-", + END=11183, + ), + ) + variant_ins_cutesv_3 = convert_record(variant_cilen3) + assert len(variant_ins_cutesv_3) == 1 + variant_ins_cutesv_3 = variant_ins_cutesv_3[0] + assert variant_ins_cutesv_3.get('break2_position_end') == 11183 + assert variant_ins_cutesv_3.get('break1_position_end') == 11183 + def test_convert_record_cuteSV(): variant_cilen = VcfRecordType( @@ -84,7 +108,7 @@ def test_convert_record_cuteSV(): variant_ins_cutesv = convert_record(variant_cilen) assert len(variant_ins_cutesv) == 1 variant_ins_cutesv = variant_ins_cutesv[0] - assert variant_ins_cutesv.get('break2_position_start') == 1853342 + assert variant_ins_cutesv.get('break2_position_start') == 1853377 assert variant_ins_cutesv.get('break2_position_end') == 1853472 variant_cilen2 = VcfRecordType( @@ -109,5 +133,7 @@ def test_convert_record_cuteSV(): variant_ins_cutesv_2 = convert_record(variant_cilen2) assert len(variant_ins_cutesv_2) == 1 variant_ins_cutesv_2 = variant_ins_cutesv_2[0] - assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break1_position_start') == 1853377 assert variant_ins_cutesv_2.get('break1_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break2_position_start') == 1853377 From 81ff13c2a5bd712caba99e7e9e430bf32b6fbae7 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 7 Mar 2022 16:27:41 -0800 Subject: [PATCH 05/17] added to description and logic --- src/mavis/breakpoint.py | 9 ++++++ src/mavis/convert/vcf.py | 66 +++++++++++++++++++++++----------------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 695f2416..b3010357 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -349,11 +349,13 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: """ uses the chr, orientations and strands to determine the possible structural_variant types that this pair could support + Args: pair (BreakpointPair): the pair to classify distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) Returns: a list of possible SVTYPE + Example: >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair.classify(bpp) @@ -361,6 +363,7 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) >>> BreakpointPair.classify(bpp) {'deletion', 'insertion'} + Note: see [related theory documentation](/background/theory/#classifying-events) """ @@ -437,16 +440,22 @@ def breakpoint_sequence_homology(self, reference_genome: ReferenceGenome): this sequence comparison is done with reference to a reference genome and does not use novel or untemplated sequence in the comparison. For this reason, insertions will never return any homologous sequence + + small duplication event CTT => CTTCTT + GATACATTTCTTCTTGAAAA reference ---------<========== first breakpoint ===========>-------- second breakpoint ---------CT-CT------ first break homology -------TT-TT-------- second break homology + Args: reference_genome: dict of reference sequence by template/chr name + Returns: Tuple[str,str]: homologous sequence at the first breakpoint and second breakpoints + Raises: AttributeError: for non specific breakpoints """ diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 683addc2..92066112 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,7 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def handle_edge_case(std_row: Dict) -> Dict: +def convert_imprecise_break(std_row: Dict, info: Dict) -> Dict: """ Checks for known errors caused by callers in IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -133,31 +133,43 @@ def handle_edge_case(std_row: Dict) -> Dict: bp2_e = breakpoint2 end Insertion edge cases include: - Scenario 1: bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. bp1_e > bp2_s - Scenario 2: bp1_s = 1890, bp1_e = 2000, bp2_s = 1800, bp2_e = 2000. bp1_e > bp1_s - Scenario 3: bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. bp1_s > bp2_s + Scenario 1 - in which bp1_e > bp2_s + E.g bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. + break1 ------------------------=======================-------------- + break2 ------------------------==========--------------------------- + + Scenario 2 - in which bp1_e > bp1_s + E.g bp1_s = 1890, bp1_e = 1800, bp2_s = 1800, bp2_e = 1800. + break1 ------------------------==----------------------------------- + break2 ------------------------=------------------------------------ + + Scenario 3 - in which bp1_s > bp2_s + E.g bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. + break1 ------------------------==----------------------------------- + break2 -----------------------========------------------------------ """ try: - if ( - std_row['break1_position_end'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # Scenario 1 - std_row.update({'break1_position_end': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # Scenario 2 - std_row.update({'break1_position_start': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_start'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # Scenario 2 - std_row.update({'break2_position_start': std_row['break1_position_start']}) - return std_row + if info.get('IMPRECISE', True): + if ( + std_row['break1_position_end'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # bp1_e > bp2_s + std_row.update({'break1_position_end': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # bp1_e > bp1_s + std_row.update({'break1_position_start': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_start'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # bp1_s > bp2_s + std_row.update({'break2_position_start': std_row['break1_position_start']}) + return std_row except KeyError as err: logger.warn(f'Missing required column inputs: {err}') @@ -237,8 +249,8 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end, } ) - elif ( - info.get('CILEN') != None + elif info.get( + 'CILEN' ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 std_row.update( { @@ -250,7 +262,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], } ) - handle_edge_case(std_row) else: std_row.update( { @@ -262,8 +273,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - handle_edge_case(std_row) - + convert_imprecise_break(std_row, info) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) From 42585abf95c3370f45826b62746be44af66e076c Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 9 Mar 2022 21:34:03 -0800 Subject: [PATCH 06/17] imprecise calls handling put into one function --- src/mavis/convert/vcf.py | 105 ++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 92066112..d5c0f9b2 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,53 +123,80 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break(std_row: Dict, info: Dict) -> Dict: +def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: """ - Checks for known errors caused by callers in IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. + Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. bp1_s = breakpoint1 start bp1_e = breakpoint1 end bp2_s = breakpoint2 start bp2_e = breakpoint2 end - Insertion edge cases include: - Scenario 1 - in which bp1_e > bp2_s + Insertion and deletion edge case - in which bp1_e > bp2_s E.g bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. break1 ------------------------=======================-------------- break2 ------------------------==========--------------------------- - Scenario 2 - in which bp1_e > bp1_s + Insertion edge case - in which bp1_e > bp1_s E.g bp1_s = 1890, bp1_e = 1800, bp2_s = 1800, bp2_e = 1800. break1 ------------------------==----------------------------------- break2 ------------------------=------------------------------------ - Scenario 3 - in which bp1_s > bp2_s + Insertion edge case - in which bp1_s > bp2_s E.g bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. break1 ------------------------==----------------------------------- break2 -----------------------========------------------------------ """ try: - if info.get('IMPRECISE', True): - if ( - std_row['break1_position_end'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_e > bp2_s - std_row.update({'break1_position_end': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_e > bp1_s - std_row.update({'break1_position_start': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_start'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_s > bp2_s - std_row.update({'break2_position_start': std_row['break1_position_start']}) - return std_row + if info.get( + 'CILEN' + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, record.stop + info.get('CILEN', (0, 0))[0] + ), + COLUMNS.break2_position_end: record.stop + info.get('CILEN', (0, 0))[1], + } + ) + else: + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, record.stop + info.get('CIEND', (0, 0))[0] + ), + COLUMNS.break2_position_end: record.stop + info.get('CIEND', (0, 0))[1], + } + ) + + if ( + std_row['break1_position_end'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and (std_row["event_type"] == 'INS') + ): # bp1_e > bp2_e + std_row.update({'break1_position_end': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and (std_row["event_type"] == 'INS') + ): # bp1_s > bp2_e + std_row.update({'break1_position_start': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_start'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and (std_row["event_type"] == 'INS') + ): # bp1_s > bp2_s + std_row.update({'break2_position_start': std_row['break1_position_start']}) + return std_row except KeyError as err: logger.warn(f'Missing required column inputs: {err}') @@ -249,31 +276,9 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end, } ) - elif info.get( - 'CILEN' - ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get('CILEN', (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], - } - ) else: - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get('CIEND', (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], - } - ) - convert_imprecise_break(std_row, info) + convert_imprecise_break(std_row, info, record) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) From 32f7bec264c5bb49367ae2bba942ad5be6665977 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 14 Mar 2022 15:07:57 -0700 Subject: [PATCH 07/17] fixes for ins handling --- src/mavis/breakpoint.py | 3 +++ src/mavis/convert/vcf.py | 24 +++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index b3010357..6d29fa4b 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -39,6 +39,7 @@ def __init__( orient (ORIENT): the orientation (which side is retained at the break) strand (STRAND): the strand seq: the seq + Examples: >>> Breakpoint('1', 1, 2) >>> Breakpoint('1', 1) @@ -250,8 +251,10 @@ def __init__( opposing_strands: are the strands at the breakpoint opposite? i.e. +/- instead of +/+ untemplated_seq: seq between the breakpoints that is not part of either breakpoint data: optional dictionary of attributes associated with this pair + Note: untemplated_seq should always be given wrt to the positive/forward reference strand + Example: >>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-')) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index d5c0f9b2..8e723edb 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,7 +123,9 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: +def convert_imprecise_break( + std_row: Dict, info: Dict, record: List[VcfRecordType], bp_end: int +) -> Dict: """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -158,10 +160,8 @@ def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: 1, record.pos + info.get('CIPOS', (0, 0))[0] ), COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, record.stop + info.get('CILEN', (0, 0))[0] - ), - COLUMNS.break2_position_end: record.stop + info.get('CILEN', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + info.get('CILEN', (0, 0))[1], } ) else: @@ -171,29 +171,27 @@ def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: 1, record.pos + info.get('CIPOS', (0, 0))[0] ), COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, record.stop + info.get('CIEND', (0, 0))[0] - ), - COLUMNS.break2_position_end: record.stop + info.get('CIEND', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + info.get('CIEND', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + info.get('CIEND', (0, 0))[1], } ) if ( std_row['break1_position_end'] > std_row['break2_position_end'] and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and (std_row["event_type"] == 'INS') + and std_row["event_type"] == 'INS' ): # bp1_e > bp2_e std_row.update({'break1_position_end': std_row['break2_position_end']}) if ( std_row['break1_position_start'] > std_row['break2_position_end'] and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and (std_row["event_type"] == 'INS') + and std_row["event_type"] == 'INS' ): # bp1_s > bp2_e std_row.update({'break1_position_start': std_row['break2_position_end']}) if ( std_row['break1_position_start'] > std_row['break2_position_start'] and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and (std_row["event_type"] == 'INS') + and std_row["event_type"] == 'INS' ): # bp1_s > bp2_s std_row.update({'break2_position_start': std_row['break1_position_start']}) return std_row @@ -277,7 +275,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - convert_imprecise_break(std_row, info, record) + convert_imprecise_break(std_row, info, record, end) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 From ef2dfeb0240d1559279ae285e784a5db3206a089 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 14 Mar 2022 17:49:44 -0700 Subject: [PATCH 08/17] removed info signature --- src/mavis/convert/vcf.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 8e723edb..20ebd8b1 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,9 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break( - std_row: Dict, info: Dict, record: List[VcfRecordType], bp_end: int -) -> Dict: +def convert_imprecise_break(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -151,28 +149,32 @@ def convert_imprecise_break( """ try: - if info.get( + if record.info.get( 'CILEN' ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 std_row.update( { COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, bp_end + record.info.get('CILEN', (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, bp_end + info.get('CILEN', (0, 0))[0]), - COLUMNS.break2_position_end: bp_end + info.get('CILEN', (0, 0))[1], + COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], } ) else: std_row.update( { COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, bp_end + record.info.get('CIEND', (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, bp_end + info.get('CIEND', (0, 0))[0]), - COLUMNS.break2_position_end: bp_end + info.get('CIEND', (0, 0))[1], + COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], } ) @@ -275,7 +277,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - convert_imprecise_break(std_row, info, record, end) + convert_imprecise_break(std_row, record, end) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 From 939f56323c1b0d4a5c992bb708500f6e3cf96684 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 29 Mar 2022 18:25:48 -0700 Subject: [PATCH 09/17] shorten code --- src/mavis/convert/vcf.py | 98 ++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 53 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 20ebd8b1..7b6eb53b 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from typing import Dict, List, Optional, Tuple - +from ..interval import Interval import pandas as pd try: @@ -123,7 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: +def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -148,58 +148,50 @@ def convert_imprecise_break(std_row: Dict, record: List[VcfRecordType], bp_end: break2 -----------------------========------------------------------ """ - try: - if record.info.get( - 'CILEN' - ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + record.info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, bp_end + record.info.get('CILEN', (0, 0))[0] - ), - COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], - } + if record.info.get( + 'CILEN' + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], + } + ) + else: + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CIEND', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], + } + ) + + if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + Interval.overlaps( + (std_row['break1_position_start'], std_row['break1_position_end']), + (std_row['break2_position_start'], std_row['break2_position_end']), + ) + or std_row['break1_position_start'] > std_row['break2_position_start'] + ): + if 'event_type' in std_row and std_row['event_type'] != 'BND': + std_row['break2_position_start'] = max( + std_row['break1_position_start'], std_row['break2_position_start'] ) - else: - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + record.info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, bp_end + record.info.get('CIEND', (0, 0))[0] - ), - COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], - } + std_row['break1_position_end'] = min( + std_row['break1_position_end'], std_row['break2_position_end'] + ) + std_row['break1_position_start'] = min( + std_row['break1_position_start'], std_row['break2_position_end'] ) - if ( - std_row['break1_position_end'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_e > bp2_e - std_row.update({'break1_position_end': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_s > bp2_e - std_row.update({'break1_position_start': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_start'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_s > bp2_s - std_row.update({'break2_position_start': std_row['break1_position_start']}) - return std_row - - except KeyError as err: - logger.warn(f'Missing required column inputs: {err}') + return std_row def convert_record(record: VcfRecordType) -> List[Dict]: @@ -277,7 +269,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - convert_imprecise_break(std_row, record, end) + convert_imprecise_breakend(std_row, record, end) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 @@ -324,7 +316,7 @@ def parse_info(info_field): return info df['info'] = df['INFO'].apply(parse_info) - df['alts'] = df['ALT'].apply(lambda a: a.split(',')) + df['alts'] = df['ALT'].apply(lambda a: str(a).split(',')) rows = [] for _, row in df.iterrows(): From e76c27f20d1068a8a2c57540fb97664aaf4dca35 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 19 Apr 2022 11:02:45 -0700 Subject: [PATCH 10/17] changed naming and updated breakend case in vcf conversion --- src/mavis/convert/vcf.py | 8 +- tests/test_mavis/convert/test_tools_vcf.py | 106 ++++++++++++--------- 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 7b6eb53b..3a5f1c35 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,7 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: +def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int): """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -179,6 +179,7 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en (std_row['break2_position_start'], std_row['break2_position_end']), ) or std_row['break1_position_start'] > std_row['break2_position_start'] + or std_row['break2_position_start'] > std_row['break2_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': std_row['break2_position_start'] = max( @@ -190,8 +191,9 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en std_row['break1_position_start'] = min( std_row['break1_position_start'], std_row['break2_position_end'] ) - - return std_row + std_row['break2_position_start'] = min( + std_row['break2_position_start'], std_row['break2_position_end'] + ) def convert_record(record: VcfRecordType) -> List[Dict]: diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index 563f8d2c..aa822ed3 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -10,7 +10,7 @@ def test_read_vcf(): assert df.shape[0] == 106 -def test_convert_record_sniffle(): +def test_convert_telomeric_region(): variant_imprecise = VcfRecordType( id='mock-BND-imprecise', pos=0, @@ -60,39 +60,36 @@ def test_convert_record_sniffle(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' - variant_cilen3 = VcfRecordType( - id='Sniffle.INS', - pos=11184, - chrom='chr2', - alts=[ - 'AGGAGGCGCACCGGGGCACCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGGCGAGAGAGGCGCGACCGCGCCCGCGTAGGCGCAGAGAGGCGCACCGCGCCCCGCGCAGGCGCAGGCGCACCCGCGCCCGCGCAGGCGCAGAGAGGCGTGACCCGCGCCCGCGCAGG' - ], - ref='N', + +def test_convert_intrachromosomal_imprecise_breakend(): + # breakpoint_2_start < breakpoint_1_start + variant_imprecise = VcfRecordType( + id='vcf-cuteSV.INS', + pos=1853407, + chrom='chr5', + alts=['AGG'], + ref='A', info=VcfInfoType( - CHR2="chr2", + CHR2="chr5", IMPRECISE=True, - SVMETHOD="Snifflesv1.0.12", + SVMETHOD="cuteSV-1.0.12", SVTYPE="INS", - SUPTYPE="AL", - STRANDS="+-", - END=11183, + CIPOS=(-30, 30), + CILEN=(-65, 65), ), ) - variant_ins_cutesv_3 = convert_record(variant_cilen3) - assert len(variant_ins_cutesv_3) == 1 - variant_ins_cutesv_3 = variant_ins_cutesv_3[0] - assert variant_ins_cutesv_3.get('break2_position_end') == 11183 - assert variant_ins_cutesv_3.get('break1_position_end') == 11183 - + variant_ins_imprecise = convert_record(variant_imprecise) + assert len(variant_ins_imprecise) == 1 + variant_ins_imprecise = variant_ins_imprecise[0] + assert variant_ins_imprecise.get('break2_position_start') == 1853377 + assert variant_ins_imprecise.get('break2_position_end') == 1853472 -def test_convert_record_cuteSV(): - variant_cilen = VcfRecordType( + # breakpoint_1_end > breakpoint_2_end + variant_imprecise2 = VcfRecordType( id='vcf-cuteSV.INS', pos=1853407, chrom='chr5', - alts=[ - 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' - ], + alts=['AGG'], ref='A', info=VcfInfoType( CHR2="chr5", @@ -101,39 +98,58 @@ def test_convert_record_cuteSV(): SVTYPE="INS", SUPTYPE="None", STRANDS="None", - CIPOS=(-30, 30), + CIPOS=(-30, 9999), CILEN=(-65, 65), ), ) - variant_ins_cutesv = convert_record(variant_cilen) - assert len(variant_ins_cutesv) == 1 - variant_ins_cutesv = variant_ins_cutesv[0] - assert variant_ins_cutesv.get('break2_position_start') == 1853377 - assert variant_ins_cutesv.get('break2_position_end') == 1853472 + variant_ins_imprecise_2 = convert_record(variant_imprecise2) + assert len(variant_ins_imprecise_2) == 1 + variant_ins_imprecise_2 = variant_ins_imprecise_2[0] + assert variant_ins_imprecise_2.get('break1_position_start') == 1853377 + assert variant_ins_imprecise_2.get('break1_position_end') == 1853472 + assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 + assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 - variant_cilen2 = VcfRecordType( - id='vcf-cuteSV.INS', + # breakpoint_2_start > breakpoint_2_end + variant_imprecise3 = VcfRecordType( + id='mock-INS-imprecise', pos=1853407, chrom='chr5', - alts=[ - 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' - ], + alts=['AGG'], ref='A', info=VcfInfoType( CHR2="chr5", IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", + SVMETHOD="Snifflesv1.0.11", SVTYPE="INS", SUPTYPE="None", STRANDS="None", CIPOS=(-30, 9999), - CILEN=(-65, 65), + CILEN=(70, 65), + ), + ) + variant_ins_imprecise_3 = convert_record(variant_imprecise3) + assert len(variant_ins_imprecise_3) == 1 + variant_ins_imprecise_3 = variant_ins_imprecise_3[0] + assert variant_ins_imprecise_3.get('break2_position_end') == 1853472 + assert variant_ins_imprecise_3.get('break2_position_start') == 1853472 + + # breakpoint_1_start > breakpoint_1_end + variant_cilen4 = VcfRecordType( + id='Sniffle.INS', + pos=11184, + chrom='chr2', + alts=['AGG'], + ref='N', + info=VcfInfoType( + CHR2="chr2", + IMPRECISE=True, + SVTYPE="INS", + END=11183, ), ) - variant_ins_cutesv_2 = convert_record(variant_cilen2) - assert len(variant_ins_cutesv_2) == 1 - variant_ins_cutesv_2 = variant_ins_cutesv_2[0] - assert variant_ins_cutesv_2.get('break1_position_start') == 1853377 - assert variant_ins_cutesv_2.get('break1_position_end') == 1853472 - assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 - assert variant_ins_cutesv_2.get('break2_position_start') == 1853377 + variant_ins_imprecise_4 = convert_record(variant_cilen4) + assert len(variant_ins_imprecise_4) == 1 + variant_ins_imprecise_4 = variant_ins_imprecise_4[0] + assert variant_ins_imprecise_4.get('break2_position_end') == 11183 + assert variant_ins_imprecise_4.get('break1_position_end') == 11183 From de14f6a5db6df631c6dc0d437ff0e077c44f4030 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 19 Apr 2022 11:16:14 -0700 Subject: [PATCH 11/17] added warning to improper formatting --- src/mavis/convert/vcf.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 3a5f1c35..434353af 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -179,7 +179,6 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en (std_row['break2_position_start'], std_row['break2_position_end']), ) or std_row['break1_position_start'] > std_row['break2_position_start'] - or std_row['break2_position_start'] > std_row['break2_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': std_row['break2_position_start'] = max( @@ -195,6 +194,18 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en std_row['break2_position_start'], std_row['break2_position_end'] ) + elif ( + std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row['break2_position_start'] > std_row['break2_position_end'] + ): + if 'event_type' in std_row and std_row['event_type'] != 'BND': + std_row['break2_position_start'] = min( + std_row['break2_position_start'], std_row['break2_position_end'] + ) + logger.warning( + f'Improper entry. Breakpoint2 start: {std_row["break2_position_start"]} is greater than Breakpoint2 end: {std_row["break2_position_end"]}\n Breakpoint2 start position coerced to breakpoint2 end position.' + ) + def convert_record(record: VcfRecordType) -> List[Dict]: """ From 4e6f5dd9844f855b3fa13b4ca10eb9a9819a59b1 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 22 Apr 2022 14:36:07 -0700 Subject: [PATCH 12/17] Clean up tests to split into separate cases --- tests/test_mavis/convert/test_tools_vcf.py | 106 +++++++++------------ 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index aa822ed3..605508e9 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,3 +1,4 @@ +import pytest from mavis.convert import SUPPORTED_TOOL, _convert_tool_row from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf @@ -61,33 +62,44 @@ def test_convert_telomeric_region(): assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' -def test_convert_intrachromosomal_imprecise_breakend(): - # breakpoint_2_start < breakpoint_1_start - variant_imprecise = VcfRecordType( - id='vcf-cuteSV.INS', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", - SVTYPE="INS", - CIPOS=(-30, 30), - CILEN=(-65, 65), - ), - ) - variant_ins_imprecise = convert_record(variant_imprecise) - assert len(variant_ins_imprecise) == 1 - variant_ins_imprecise = variant_ins_imprecise[0] - assert variant_ins_imprecise.get('break2_position_start') == 1853377 - assert variant_ins_imprecise.get('break2_position_end') == 1853472 +TEST_POS = 1853407 - # breakpoint_1_end > breakpoint_2_end - variant_imprecise2 = VcfRecordType( + +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2', + [ + [ + TEST_POS, + (-30, 30), + (-65, 65), + (TEST_POS - 30, TEST_POS + 30), + (TEST_POS - 30, TEST_POS + 65), + ], + [ + TEST_POS, + (-30, 99999), + (-10, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS - 10, TEST_POS + 65), + ], + [ + TEST_POS, + (-30, 99999), + (70, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS + 65, TEST_POS + 65), + ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + 'breakpoint_2_start > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend(pos, break1_ci, break2_ci, break1, break2): + variant_vcf = VcfRecordType( id='vcf-cuteSV.INS', - pos=1853407, + pos=pos, chrom='chr5', alts=['AGG'], ref='A', @@ -96,44 +108,20 @@ def test_convert_intrachromosomal_imprecise_breakend(): IMPRECISE=True, SVMETHOD="cuteSV-1.0.12", SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(-65, 65), + CIPOS=break1_ci, + CILEN=break2_ci, ), ) - variant_ins_imprecise_2 = convert_record(variant_imprecise2) - assert len(variant_ins_imprecise_2) == 1 - variant_ins_imprecise_2 = variant_ins_imprecise_2[0] - assert variant_ins_imprecise_2.get('break1_position_start') == 1853377 - assert variant_ins_imprecise_2.get('break1_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 + result = convert_record(variant_vcf) + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] - # breakpoint_2_start > breakpoint_2_end - variant_imprecise3 = VcfRecordType( - id='mock-INS-imprecise', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="Snifflesv1.0.11", - SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(70, 65), - ), - ) - variant_ins_imprecise_3 = convert_record(variant_imprecise3) - assert len(variant_ins_imprecise_3) == 1 - variant_ins_imprecise_3 = variant_ins_imprecise_3[0] - assert variant_ins_imprecise_3.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_3.get('break2_position_start') == 1853472 +def test_convert_intrachromosomal_imprecise_breakend_no_ci(): # breakpoint_1_start > breakpoint_1_end variant_cilen4 = VcfRecordType( id='Sniffle.INS', From f6903c7c2de97f066d3c7dbedee60272678cd721 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 13:13:58 -0700 Subject: [PATCH 13/17] drop calls which violate VCF norms --- src/mavis/convert/vcf.py | 41 +++++++++++----------- tests/test_mavis/convert/test_tools_vcf.py | 37 +++++++++++++------ 2 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 434353af..47d1646c 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -178,7 +178,6 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en (std_row['break1_position_start'], std_row['break1_position_end']), (std_row['break2_position_start'], std_row['break2_position_end']), ) - or std_row['break1_position_start'] > std_row['break2_position_start'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': std_row['break2_position_start'] = max( @@ -190,21 +189,27 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en std_row['break1_position_start'] = min( std_row['break1_position_start'], std_row['break2_position_end'] ) - std_row['break2_position_start'] = min( - std_row['break2_position_start'], std_row['break2_position_end'] - ) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: + # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 + std_row.update({'break1_position_end': 1}) + if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: + std_row.update({'break2_position_end': 1}) - elif ( - std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row['break2_position_start'] > std_row['break2_position_end'] + if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + std_row['break2_position_start'] > std_row['break2_position_end'] + or std_row['break1_position_start'] > std_row['break1_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': - std_row['break2_position_start'] = min( - std_row['break2_position_start'], std_row['break2_position_end'] - ) - logger.warning( - f'Improper entry. Breakpoint2 start: {std_row["break2_position_start"]} is greater than Breakpoint2 end: {std_row["break2_position_end"]}\n Breakpoint2 start position coerced to breakpoint2 end position.' + logger.error( + f'Improper entry. One of the following breakpoints start is greater than breakpoint end:\n Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]}\n Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]}\n This call has been dropped.' ) + std_row.clear() + + if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END'): + logger.error( + f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' + ) + std_row.clear() def convert_record(record: VcfRecordType) -> List[Dict]: @@ -283,13 +288,9 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) else: convert_imprecise_breakend(std_row, record, end) - - if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: - # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 - std_row.update({'break1_position_end': 1}) - if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: - std_row.update({'break2_position_end': 1}) - + print(std_row) + if len(std_row) == 0: + continue try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} @@ -329,7 +330,7 @@ def parse_info(info_field): return info df['info'] = df['INFO'].apply(parse_info) - df['alts'] = df['ALT'].apply(lambda a: str(a).split(',')) + df['alts'] = df['ALT'].apply(lambda a: a.split(',')) rows = [] for _, row in df.iterrows(): diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index aa822ed3..a01b3372 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -110,7 +110,7 @@ def test_convert_intrachromosomal_imprecise_breakend(): assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 - # breakpoint_2_start > breakpoint_2_end + # breakpoint_2_start > breakpoint_2_end (invalid entry) variant_imprecise3 = VcfRecordType( id='mock-INS-imprecise', pos=1853407, @@ -129,12 +129,32 @@ def test_convert_intrachromosomal_imprecise_breakend(): ), ) variant_ins_imprecise_3 = convert_record(variant_imprecise3) - assert len(variant_ins_imprecise_3) == 1 - variant_ins_imprecise_3 = variant_ins_imprecise_3[0] - assert variant_ins_imprecise_3.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_3.get('break2_position_start') == 1853472 + print(variant_ins_imprecise_3) + assert len(variant_ins_imprecise_3) == 0 - # breakpoint_1_start > breakpoint_1_end + # breakpoint_1_start > breakpoint_1_end (invalid entry) + variant_imprecise4 = VcfRecordType( + id='mock-INS-imprecise', + pos=1853407, + chrom='chr5', + alts=['AGG'], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="Snifflesv1.0.11", + SVTYPE="INS", + SUPTYPE="None", + STRANDS="None", + CIPOS=(999, 100), + CILEN=(65, 65), + ), + ) + variant_ins_imprecise_4 = convert_record(variant_imprecise4) + print(variant_ins_imprecise_4) + assert len(variant_ins_imprecise_4) == 0 + + # pos > end (invalid entry) variant_cilen4 = VcfRecordType( id='Sniffle.INS', pos=11184, @@ -149,7 +169,4 @@ def test_convert_intrachromosomal_imprecise_breakend(): ), ) variant_ins_imprecise_4 = convert_record(variant_cilen4) - assert len(variant_ins_imprecise_4) == 1 - variant_ins_imprecise_4 = variant_ins_imprecise_4[0] - assert variant_ins_imprecise_4.get('break2_position_end') == 11183 - assert variant_ins_imprecise_4.get('break1_position_end') == 11183 + assert len(variant_ins_imprecise_4) == 0 From 7528e4a897135dd4b9bdf4f6e0c61c798329fe22 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 15:01:00 -0700 Subject: [PATCH 14/17] drop calls that violate VCF standards in convert vcf --- src/mavis/convert/vcf.py | 1 - tests/test_mavis/convert/test_convert.py | 11 ++ tests/test_mavis/convert/test_tools_vcf.py | 138 +++++++++------------ 3 files changed, 68 insertions(+), 82 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 47d1646c..f5ace9d0 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -288,7 +288,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) else: convert_imprecise_breakend(std_row, record, end) - print(std_row) if len(std_row) == 0: continue try: diff --git a/tests/test_mavis/convert/test_convert.py b/tests/test_mavis/convert/test_convert.py index 4470bc66..44b8aa24 100644 --- a/tests/test_mavis/convert/test_convert.py +++ b/tests/test_mavis/convert/test_convert.py @@ -1,3 +1,4 @@ +import itertools import os import shutil import sys @@ -47,6 +48,16 @@ def run_main(self, inputfile, file_type, strand_specific=False): result.setdefault(pair.data['tracking_id'], []).append(pair) return result + def test_straglr(self): + result = self.run_main(get_data('straglr.bed'), SUPPORTED_TOOL.STRAGLR, False) + assert len(result) == 9 + for bpp in itertools.chain(*result.values()): + assert bpp.break1.chr == bpp.break2.chr + assert bpp.break1.start == bpp.break2.start + assert bpp.break1.end == bpp.break2.end + assert bpp.event_type == SVTYPE.INS + assert bpp.untemplated_seq is None + def test_chimerascan(self): self.run_main(get_data('chimerascan_output.bedpe'), SUPPORTED_TOOL.CHIMERASCAN, False) diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index a01b3372..2c5dcd1f 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,3 +1,4 @@ +import pytest from mavis.convert import SUPPORTED_TOOL, _convert_tool_row from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf @@ -61,100 +62,75 @@ def test_convert_telomeric_region(): assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' -def test_convert_intrachromosomal_imprecise_breakend(): - # breakpoint_2_start < breakpoint_1_start - variant_imprecise = VcfRecordType( - id='vcf-cuteSV.INS', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", - SVTYPE="INS", - CIPOS=(-30, 30), - CILEN=(-65, 65), - ), - ) - variant_ins_imprecise = convert_record(variant_imprecise) - assert len(variant_ins_imprecise) == 1 - variant_ins_imprecise = variant_ins_imprecise[0] - assert variant_ins_imprecise.get('break2_position_start') == 1853377 - assert variant_ins_imprecise.get('break2_position_end') == 1853472 +TEST_POS = 1853407 - # breakpoint_1_end > breakpoint_2_end - variant_imprecise2 = VcfRecordType( - id='vcf-cuteSV.INS', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", - SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(-65, 65), - ), - ) - variant_ins_imprecise_2 = convert_record(variant_imprecise2) - assert len(variant_ins_imprecise_2) == 1 - variant_ins_imprecise_2 = variant_ins_imprecise_2[0] - assert variant_ins_imprecise_2.get('break1_position_start') == 1853377 - assert variant_ins_imprecise_2.get('break1_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 - # breakpoint_2_start > breakpoint_2_end (invalid entry) - variant_imprecise3 = VcfRecordType( - id='mock-INS-imprecise', - pos=1853407, +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2,ids', + [ + [ + TEST_POS, + (-30, 30), + (-65, 65), + (TEST_POS - 30, TEST_POS + 30), + (TEST_POS - 30, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start', + ], + [ + TEST_POS, + (-30, 99999), + (-10, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS - 10, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end', + ], + [ + TEST_POS, + (-30, 99999), + (70, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS + 65, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end', + ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + 'breakpoint_2_start > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend( + pos, break1_ci, break2_ci, break1, break2, ids +): + variant_vcf = VcfRecordType( + id=ids, + pos=pos, chrom='chr5', alts=['AGG'], ref='A', info=VcfInfoType( CHR2="chr5", IMPRECISE=True, - SVMETHOD="Snifflesv1.0.11", + SVMETHOD="cuteSV-1.0.12", SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(70, 65), + CIPOS=break1_ci, + CILEN=break2_ci, ), ) - variant_ins_imprecise_3 = convert_record(variant_imprecise3) - print(variant_ins_imprecise_3) - assert len(variant_ins_imprecise_3) == 0 + result = convert_record(variant_vcf) + if ids == "vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end": + assert len(result) == 0 + else: + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] - # breakpoint_1_start > breakpoint_1_end (invalid entry) - variant_imprecise4 = VcfRecordType( - id='mock-INS-imprecise', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="Snifflesv1.0.11", - SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(999, 100), - CILEN=(65, 65), - ), - ) - variant_ins_imprecise_4 = convert_record(variant_imprecise4) - print(variant_ins_imprecise_4) - assert len(variant_ins_imprecise_4) == 0 - # pos > end (invalid entry) +def test_convert_intrachromosomal_imprecise_breakend_no_ci(): + # breakpoint_1_start > breakpoint_1_end variant_cilen4 = VcfRecordType( id='Sniffle.INS', pos=11184, From f769ce38183307db972982bf42c8c84495998212 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 17:04:45 -0700 Subject: [PATCH 15/17] intrachromosomal test change --- src/mavis/convert/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index f5ace9d0..30ac2fe9 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -205,7 +205,7 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en ) std_row.clear() - if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END'): + if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END') and std_row["break1_chromosome"] == std_row["break2_chromosome"]: logger.error( f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' ) From 197a8147d68716ac8910d3d175acec079f411108 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 18:15:12 -0700 Subject: [PATCH 16/17] reformmated file --- src/mavis/convert/vcf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 30ac2fe9..66cbdf52 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -205,7 +205,11 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en ) std_row.clear() - if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END') and std_row["break1_chromosome"] == std_row["break2_chromosome"]: + if ( + not None in (record.pos, record.info.get('END')) + and record.pos > record.info.get('END') + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + ): logger.error( f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' ) From 9e64e59b934bb5c4485d4da390f1ebd90a79ac03 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 26 Apr 2022 12:03:44 -0700 Subject: [PATCH 17/17] style changes and change error handling logic --- src/mavis/convert/vcf.py | 26 +++++----- tests/test_mavis/convert/test_tools_vcf.py | 56 ++++++++++++++++------ 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 66cbdf52..c6447af5 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -147,6 +147,7 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en break1 ------------------------==----------------------------------- break2 -----------------------========------------------------------ """ + is_intrachromosomal = std_row["break1_chromosome"] == std_row["break2_chromosome"] if record.info.get( 'CILEN' @@ -173,13 +174,13 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en } ) - if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + if is_intrachromosomal and ( Interval.overlaps( (std_row['break1_position_start'], std_row['break1_position_end']), (std_row['break2_position_start'], std_row['break2_position_end']), ) ): - if 'event_type' in std_row and std_row['event_type'] != 'BND': + if std_row.get('event_type') != 'BND': std_row['break2_position_start'] = max( std_row['break1_position_start'], std_row['break2_position_start'] ) @@ -195,25 +196,23 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: std_row.update({'break2_position_end': 1}) - if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + if is_intrachromosomal and ( std_row['break2_position_start'] > std_row['break2_position_end'] or std_row['break1_position_start'] > std_row['break1_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': - logger.error( - f'Improper entry. One of the following breakpoints start is greater than breakpoint end:\n Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]}\n Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]}\n This call has been dropped.' + raise ValueError( + f'Improper entry. One of the following breakpoints start is greater than breakpoint end: Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]} Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]} This call has been dropped.' ) - std_row.clear() if ( - not None in (record.pos, record.info.get('END')) + None not in (record.pos, record.info.get('END')) and record.pos > record.info.get('END') - and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and is_intrachromosomal ): - logger.error( - f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' + raise ValueError( + f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}). This call has been dropped.' ) - std_row.clear() def convert_record(record: VcfRecordType) -> List[Dict]: @@ -292,8 +291,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) else: convert_imprecise_breakend(std_row, record, end) - if len(std_row) == 0: - continue + try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} @@ -413,6 +411,6 @@ def convert_file(input_file: str) -> List[Dict]: for variant_record in convert_pandas_rows_to_variants(data): try: rows.extend(convert_record(variant_record)) - except NotImplementedError as err: + except (NotImplementedError, ValueError) as err: logging.warning(str(err)) return rows diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index 2c5dcd1f..b1cf72bd 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -84,6 +84,42 @@ def test_convert_telomeric_region(): (TEST_POS - 10, TEST_POS + 65), 'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end', ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend( + pos, break1_ci, break2_ci, break1, break2, ids +): + variant_vcf = VcfRecordType( + id=ids, + pos=pos, + chrom='chr5', + alts=['AGG'], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + CIPOS=break1_ci, + CILEN=break2_ci, + ), + ) + result = convert_record(variant_vcf) + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] + + +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2,ids', + [ [ TEST_POS, (-30, 99999), @@ -94,12 +130,10 @@ def test_convert_telomeric_region(): ], ], ids=[ - 'breakpoint_2_start < breakpoint_1_start', - 'breakpoint_1_end > breakpoint_2_end', 'breakpoint_2_start > breakpoint_2_end', ], ) -def test_convert_intrachromosomal_imprecise_breakend( +def test_error_on_convert_intrachromosomal_imprecise_breakend( pos, break1_ci, break2_ci, break1, break2, ids ): variant_vcf = VcfRecordType( @@ -117,16 +151,8 @@ def test_convert_intrachromosomal_imprecise_breakend( CILEN=break2_ci, ), ) - result = convert_record(variant_vcf) - if ids == "vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end": - assert len(result) == 0 - else: - assert len(result) == 1 - variant = result[0] - assert variant.get('break1_position_start') == break1[0] - assert variant.get('break1_position_end') == break1[1] - assert variant.get('break2_position_start') == break2[0] - assert variant.get('break2_position_end') == break2[1] + with pytest.raises(ValueError): + convert_record(variant_vcf) def test_convert_intrachromosomal_imprecise_breakend_no_ci(): @@ -144,5 +170,5 @@ def test_convert_intrachromosomal_imprecise_breakend_no_ci(): END=11183, ), ) - variant_ins_imprecise_4 = convert_record(variant_cilen4) - assert len(variant_ins_imprecise_4) == 0 + with pytest.raises(ValueError): + convert_record(variant_cilen4)