Skip to content

Commit

Permalink
Merge pull request #553 from Sage-Bionetworks/GEN_1067_warn_for_unchanged_mutations
Browse files Browse the repository at this point in the history

[GEN-1067] add functionality to warn for identical ref and tsa2
  • Loading branch information
danlu1 authored Mar 14, 2024
2 parents 9c10710 + b636835 commit d190e61
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 40 deletions.
16 changes: 13 additions & 3 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
logger = logging.getLogger(__name__)


def _check_tsa1_tsa2(df):
def _check_allele_col_validity(df):
"""If maf file has both TSA1 and TSA2,
TSA1 must equal REF, or TSA1 must equal TSA2.
TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
"""
tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
Expand All @@ -29,6 +29,16 @@ def _check_tsa1_tsa2(df):
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)
if (
tsa2_col_exist
and ref_col_exist
and not df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").empty
):
error = (
f"{error}maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n"
)
row_index = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").index.values
return error


Expand Down Expand Up @@ -260,7 +270,7 @@ def _validate(self, mutationDF):
# "start with 'chr' or any 'WT' values.\n"
# )

error = _check_tsa1_tsa2(mutationDF)
error = _check_allele_col_validity(mutationDF)
total_error.write(error)

if process_functions.checkColExist(mutationDF, "TUMOR_SAMPLE_BARCODE"):
Expand Down
152 changes: 115 additions & 37 deletions tests/test_maf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from cmath import nan
from unittest.mock import mock_open, patch

import pandas as pd
Expand Down Expand Up @@ -81,7 +82,7 @@ def test_firstcolumn_validation(maf_class):
"N_DEPTH": [1, 2, 3, 4, 3],
"N_REF_COUNT": [1, 2, 3, 4, 3],
"N_ALT_COUNT": [1, 2, 3, 4, 3],
"TUMOR_SEQ_ALLELE2": ["A", "A", "A", "A", "A"],
"TUMOR_SEQ_ALLELE2": ["T", "A", "A", "A", "A"],
}
)
order = [
Expand Down Expand Up @@ -196,7 +197,7 @@ def test_invalid_validation(maf_class):
)

with patch.object(
genie_registry.maf, "_check_tsa1_tsa2", return_value=""
genie_registry.maf, "_check_allele_col_validity", return_value=""
) as check_tsa1_tsa2:
error, warning = maf_class._validate(mafDf)
check_tsa1_tsa2.assert_called_once_with(mafDf)
Expand Down Expand Up @@ -241,47 +242,124 @@ def test_error__check_allele_col():
assert warning == ""


def test_invalid__check_tsa1_tsa2():
"""Test the scenario in which maf file has TSA1 and TSA2 and fails"""
df = pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["B", "B", "B"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
)
error = genie_registry.maf._check_tsa1_tsa2(df)
assert error == (
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)


@pytest.mark.parametrize(
"df",
"test_df,expected_error",
[
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
),
"",
),
pd.DataFrame(
dict(
REFERENCE_ALLELE=["C", "C", "C"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["A", "A", "A"],
)
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["C", "C", "C"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["A", "A", "A"],
)
),
"",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["B", "B", "B"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
),
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["A", "A", "A"],
TUMOR_SEQ_ALLELE2=["A", "C", "C"],
)
),
"maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE2=["A", "C", "C"],
)
),
"maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
),
"",
),
(
pd.DataFrame(
dict(
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
)
),
"",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["B", "B", "B"],
TUMOR_SEQ_ALLELE2=["A", "C", "C"],
)
),
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
"maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=[nan, "A", "A"],
TUMOR_SEQ_ALLELE1=["B", nan, "B"],
TUMOR_SEQ_ALLELE2=[nan, "C", "C"],
)
),
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n",
),
],
ids=[
"matching_tsa1_tsa2",
"matching_tsa1_ref",
"invalid_tsa1",
"identical_ref_tsa2",
"identical_ref_tsa2_missing_tsa1",
"valid_ref_tsa2_missing_tsa1",
"missing_tsa2_ref",
"invalid_tsa1_identical_ref_tsa2",
"NAs_in_allele_cole",
],
)
def test_valid__check_tsa1_tsa2(df):
"""Test valid TSA1 and TSA2"""
error = genie_registry.maf._check_tsa1_tsa2(df)
assert error == ""
def test__check_allele_col_validity(test_df, expected_error):
error = genie_registry.maf._check_allele_col_validity(test_df)
assert error == expected_error


def test_that__cross_validate_does_not_read_files_if_no_clinical_files(maf_class):
Expand Down

0 comments on commit d190e61

Please sign in to comment.