Skip to content

Commit

Permalink
add functionality to warn for identical ref and tsa2
Browse files Browse the repository at this point in the history
  • Loading branch information
danlu1 committed Mar 8, 2024
1 parent 429dafb commit 4a04b5e
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 2 deletions.
5 changes: 4 additions & 1 deletion genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

def _check_tsa1_tsa2(df):
"""If maf file has both TSA1 and TSA2,
TSA1 must equal REF, or TSA1 must equal TSA2.
TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
"""
tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
Expand All @@ -29,6 +29,9 @@ def _check_tsa1_tsa2(df):
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)
if tsa2_col_exist and ref_col_exist and not df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').empty:
error =(f"{error}REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. "
f"Please check row: {', '.join(str(e+1) for e in df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').index.values)}.\n")
return error


Expand Down
35 changes: 34 additions & 1 deletion tests/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_firstcolumn_validation(maf_class):
"N_DEPTH": [1, 2, 3, 4, 3],
"N_REF_COUNT": [1, 2, 3, 4, 3],
"N_ALT_COUNT": [1, 2, 3, 4, 3],
"TUMOR_SEQ_ALLELE2": ["A", "A", "A", "A", "A"],
"TUMOR_SEQ_ALLELE2": ["T", "A", "A", "A", "A"],
}
)
order = [
Expand Down Expand Up @@ -258,6 +258,39 @@ def test_invalid__check_tsa1_tsa2():
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)

def test_invalid__check_ref_tsa2():
    """Check that a REFERENCE_ALLELE / TUMOR_SEQ_ALLELE2 match is reported.

    TUMOR_SEQ_ALLELE1 equals REFERENCE_ALLELE in every row, so the only
    message expected is the one about REFERENCE_ALLELE matching
    TUMOR_SEQ_ALLELE2. The two columns agree only in the first row, which
    the message reports as row 1.
    """
    maf_df = pd.DataFrame(
        {
            "REFERENCE_ALLELE": ["A", "A", "A"],
            "TUMOR_SEQ_ALLELE1": ["A", "A", "A"],
            "TUMOR_SEQ_ALLELE2": ["A", "C", "C"],
        }
    )
    expected = (
        "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. "
        "Please check row: 1.\n"
    )
    assert genie_registry.maf._check_tsa1_tsa2(maf_df) == expected

def test_invalid__check_ref_tsa1_tsa2():
    """Check that the TSA1 and REF/TSA2 messages are returned together.

    TUMOR_SEQ_ALLELE1 matches neither REFERENCE_ALLELE nor
    TUMOR_SEQ_ALLELE2, and the first row has REFERENCE_ALLELE equal to
    TUMOR_SEQ_ALLELE2. The expected result is the TSA1 message followed
    immediately by the REF/TSA2 message naming row 1.
    """
    maf_df = pd.DataFrame(
        {
            "REFERENCE_ALLELE": ["A", "A", "A"],
            "TUMOR_SEQ_ALLELE1": ["B", "B", "B"],
            "TUMOR_SEQ_ALLELE2": ["A", "C", "C"],
        }
    )
    # Message for TUMOR_SEQ_ALLELE1 matching neither of the other columns.
    tsa1_message = (
        "maf: Contains both "
        "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
        "All values in TUMOR_SEQ_ALLELE1 must match all values in "
        "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
    )
    # Message for rows where REFERENCE_ALLELE equals TUMOR_SEQ_ALLELE2.
    ref_tsa2_message = (
        "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. "
        "Please check row: 1.\n"
    )
    result = genie_registry.maf._check_tsa1_tsa2(maf_df)
    assert result == tsa1_message + ref_tsa2_message

@pytest.mark.parametrize(
"df",
Expand Down

0 comments on commit 4a04b5e

Please sign in to comment.