From 4a04b5eeac60ef2fc1a40ebaf43afea98f2040a4 Mon Sep 17 00:00:00 2001 From: Dan Lu Date: Fri, 8 Mar 2024 15:09:36 -0800 Subject: [PATCH 1/7] add functionality to warn for identical ref and tsa2 --- genie_registry/maf.py | 5 ++++- tests/test_maf.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index aeb8246a..5d313dab 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -13,7 +13,7 @@ def _check_tsa1_tsa2(df): """If maf file has both TSA1 and TSA2, - TSA1 must equal REF, or TSA1 must equal TSA2. + TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2 """ tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2") tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1") @@ -29,6 +29,9 @@ def _check_tsa1_tsa2(df): "All values in TUMOR_SEQ_ALLELE1 must match all values in " "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" ) + if tsa2_col_exist and ref_col_exist and not df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').empty: + error =(f"{error}REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. " + f"Please check row: {', '.join(str(e+1) for e in df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').index.values)}.\n") return error diff --git a/tests/test_maf.py b/tests/test_maf.py index b86b202b..6cdd0f8a 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -81,7 +81,7 @@ def test_firstcolumn_validation(maf_class): "N_DEPTH": [1, 2, 3, 4, 3], "N_REF_COUNT": [1, 2, 3, 4, 3], "N_ALT_COUNT": [1, 2, 3, 4, 3], - "TUMOR_SEQ_ALLELE2": ["A", "A", "A", "A", "A"], + "TUMOR_SEQ_ALLELE2": ["T", "A", "A", "A", "A"], } ) order = [ @@ -258,6 +258,39 @@ def test_invalid__check_tsa1_tsa2(): "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" ) +def test_invalid__check_ref_tsa2(): + """Test the scenario in which maf file has identical REF and tsa2 and fails""" + df = pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE1=["A", "A", "A"], + TUMOR_SEQ_ALLELE2=["A", "C", "C"], + ) + ) + error = genie_registry.maf._check_tsa1_tsa2(df) + assert error == ( + "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. " + "Please check row: 1.\n" + ) + +def test_invalid__check_ref_tsa1_tsa2(): + """Test the scenario in which maf file has TSA1 and TSA2 and fails""" + df = pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE1=["B", "B", "B"], + TUMOR_SEQ_ALLELE2=["A", "C", "C"], + ) + ) + error = genie_registry.maf._check_tsa1_tsa2(df) + assert error == ( + "maf: Contains both " + "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. " + "All values in TUMOR_SEQ_ALLELE1 must match all values in " + "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" + "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. " + "Please check row: 1.\n" + ) @pytest.mark.parametrize( "df", From 8b49e1135e5a97cc57133257d2a35089d04c7f9c Mon Sep 17 00:00:00 2001 From: Dan Lu <90745557+danlu1@users.noreply.github.com> Date: Mon, 11 Mar 2024 11:03:34 -0700 Subject: [PATCH 2/7] Update maf.py remove extra trailing whitespaces --- genie_registry/maf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 5d313dab..ce7df29b 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -30,11 +30,9 @@ def _check_tsa1_tsa2(df): "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" ) if tsa2_col_exist and ref_col_exist and not df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').empty: - error =(f"{error}REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. " - f"Please check row: {', '.join(str(e+1) for e in df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').index.values)}.\n") + error =(f"{error}REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. Please check row: {', '.join(str(e+1) for e in df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').index.values)}.\n") return error - def _check_allele_col(df, col): """ Check the Allele column is correctly formatted. From ea7c91591eb10b4e8873d5dda67ed1ffecfe76af Mon Sep 17 00:00:00 2001 From: Dan Lu Date: Mon, 11 Mar 2024 15:02:39 -0700 Subject: [PATCH 3/7] update error message and save row index as variable --- genie_registry/maf.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index ce7df29b..f54b1353 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -29,10 +29,19 @@ def _check_tsa1_tsa2(df): "All values in TUMOR_SEQ_ALLELE1 must match all values in " "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" ) - if tsa2_col_exist and ref_col_exist and not df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').empty: - error =(f"{error}REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. Please check row: {', '.join(str(e+1) for e in df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').index.values)}.\n") + if ( + tsa2_col_exist + and ref_col_exist + and not df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").empty + ): + error = ( + "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " + "This is invalid. Please correct." + ) + row_index = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").index.values return error + def _check_allele_col(df, col): """ Check the Allele column is correctly formatted. From 083ab0fbf80c5ced6f5d8c9caa8a33006e2c23b7 Mon Sep 17 00:00:00 2001 From: Dan Lu Date: Tue, 12 Mar 2024 12:35:50 -0700 Subject: [PATCH 4/7] update test cases --- genie_registry/maf.py | 2 +- tests/test_maf.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index f54b1353..03d5eb63 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -36,7 +36,7 @@ def _check_tsa1_tsa2(df): ): error = ( "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " - "This is invalid. Please correct." + "This is invalid. Please correct.\n" ) row_index = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").index.values return error diff --git a/tests/test_maf.py b/tests/test_maf.py index 6cdd0f8a..5900e63c 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -258,6 +258,7 @@ def test_invalid__check_tsa1_tsa2(): "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" ) + def test_invalid__check_ref_tsa2(): """Test the scenario in which maf file has identical REF and tsa2 and fails""" df = pd.DataFrame( @@ -269,10 +270,11 @@ def test_invalid__check_ref_tsa2(): ) error = genie_registry.maf._check_tsa1_tsa2(df) assert error == ( - "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. " - "Please check row: 1.\n" + "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " + "This is invalid. Please correct.\n" ) + def test_invalid__check_ref_tsa1_tsa2(): """Test the scenario in which maf file has TSA1 and TSA2 and fails""" df = pd.DataFrame( @@ -292,6 +294,7 @@ def test_invalid__check_ref_tsa1_tsa2(): "Please check row: 1.\n" ) + @pytest.mark.parametrize( "df", [ From c678cc39c4f22f1dff5923cbf8be95bde05be207 Mon Sep 17 00:00:00 2001 From: Dan Lu Date: Tue, 12 Mar 2024 16:23:09 -0700 Subject: [PATCH 5/7] add test cases --- tests/test_maf.py | 133 ++++++++++++++++++++++++++++------------------ 1 file changed, 82 insertions(+), 51 deletions(-) diff --git a/tests/test_maf.py b/tests/test_maf.py index 5900e63c..97a19c0f 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -241,40 +241,6 @@ def test_error__check_allele_col(): assert warning == "" -def test_invalid__check_tsa1_tsa2(): - """Test the scenario in which maf file has TSA1 and TSA2 and fails""" - df = pd.DataFrame( - dict( - REFERENCE_ALLELE=["A", "A", "A"], - TUMOR_SEQ_ALLELE1=["B", "B", "B"], - TUMOR_SEQ_ALLELE2=["C", "C", "C"], - ) - ) - error = genie_registry.maf._check_tsa1_tsa2(df) - assert error == ( - "maf: Contains both " - "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. " - "All values in TUMOR_SEQ_ALLELE1 must match all values in " - "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" - ) - - -def test_invalid__check_ref_tsa2(): - """Test the scenario in which maf file has identical REF and tsa2 and fails""" - df = pd.DataFrame( - dict( - REFERENCE_ALLELE=["A", "A", "A"], - TUMOR_SEQ_ALLELE1=["A", "A", "A"], - TUMOR_SEQ_ALLELE2=["A", "C", "C"], - ) - ) - error = genie_registry.maf._check_tsa1_tsa2(df) - assert error == ( - "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " - "This is invalid. Please correct.\n" - ) - - def test_invalid__check_ref_tsa1_tsa2(): """Test the scenario in which maf file has TSA1 and TSA2 and fails""" df = pd.DataFrame( @@ -296,28 +262,93 @@ def test_invalid__check_ref_tsa1_tsa2(): @pytest.mark.parametrize( - "df", + "test_df,expected_error", [ - pd.DataFrame( - dict( - REFERENCE_ALLELE=["A", "A", "A"], - TUMOR_SEQ_ALLELE1=["C", "C", "C"], - TUMOR_SEQ_ALLELE2=["C", "C", "C"], - ) + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE1=["C", "C", "C"], + TUMOR_SEQ_ALLELE2=["C", "C", "C"], + ) + ), + "", ), - pd.DataFrame( - dict( - REFERENCE_ALLELE=["C", "C", "C"], - TUMOR_SEQ_ALLELE1=["C", "C", "C"], - TUMOR_SEQ_ALLELE2=["A", "A", "A"], - ) + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["C", "C", "C"], + TUMOR_SEQ_ALLELE1=["C", "C", "C"], + TUMOR_SEQ_ALLELE2=["A", "A", "A"], + ) + ), + "", + ), + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE1=["B", "B", "B"], + TUMOR_SEQ_ALLELE2=["C", "C", "C"], + ) + ), + "maf: Contains both " + "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. " + "All values in TUMOR_SEQ_ALLELE1 must match all values in " + "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n", + ), + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE1=["A", "A", "A"], + TUMOR_SEQ_ALLELE2=["A", "C", "C"], + ) + ), + "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " + "This is invalid. Please correct.\n", + ), + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE2=["A", "C", "C"], + ) + ), + "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " + "This is invalid. Please correct.\n", + ), + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE2=["C", "C", "C"], + ) + ), + "", ), + ( + pd.DataFrame( + dict( + TUMOR_SEQ_ALLELE1=["C", "C", "C"], + ) + ), + "", + ), + ], + ids=[ + "matching_tsa1_tsa2", + "matching_tsa1_ref", + "invalid_tsa1", + "identical_ref_tsa2", + "identical_ref_tsa2_missing_tsa1", + "valid_ref_tsa2_missing_tsa1", + "missing_tsa2_ref", ], ) -def test_valid__check_tsa1_tsa2(df): - """Test valid TSA1 and TSA2""" - error = genie_registry.maf._check_tsa1_tsa2(df) - assert error == "" +def test__check_tsa1_tsa2(test_df, expected_error): + error = genie_registry.maf._check_tsa1_tsa2(test_df) + assert error == expected_error def test_that__cross_validate_does_not_read_files_if_no_clinical_files(maf_class): From ccea1363c45c5b3ff8afa6657ca660e983a8be3c Mon Sep 17 00:00:00 2001 From: Dan Lu Date: Tue, 12 Mar 2024 19:39:31 -0700 Subject: [PATCH 6/7] clean up test functions --- genie_registry/maf.py | 2 +- tests/test_maf.py | 36 ++++++++++++++++-------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 03d5eb63..74cfe657 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -35,7 +35,7 @@ def _check_tsa1_tsa2(df): and not df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").empty ): error = ( - "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " + f"{error}maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " "This is invalid. Please correct.\n" ) row_index = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").index.values diff --git a/tests/test_maf.py b/tests/test_maf.py index 97a19c0f..b0eb209b 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -241,26 +241,6 @@ def test_error__check_allele_col(): assert warning == "" -def test_invalid__check_ref_tsa1_tsa2(): - """Test the scenario in which maf file has TSA1 and TSA2 and fails""" - df = pd.DataFrame( - dict( - REFERENCE_ALLELE=["A", "A", "A"], - TUMOR_SEQ_ALLELE1=["B", "B", "B"], - TUMOR_SEQ_ALLELE2=["A", "C", "C"], - ) - ) - error = genie_registry.maf._check_tsa1_tsa2(df) - assert error == ( - "maf: Contains both " - "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. " - "All values in TUMOR_SEQ_ALLELE1 must match all values in " - "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" - "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. " - "Please check row: 1.\n" - ) - - @pytest.mark.parametrize( "test_df,expected_error", [ @@ -335,6 +315,21 @@ def test_invalid__check_ref_tsa1_tsa2(): ), "", ), + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=["A", "A", "A"], + TUMOR_SEQ_ALLELE1=["B", "B", "B"], + TUMOR_SEQ_ALLELE2=["A", "C", "C"], + ) + ), + "maf: Contains both " + "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. " + "All values in TUMOR_SEQ_ALLELE1 must match all values in " + "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n" + "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " + "This is invalid. Please correct.\n", + ), ], ids=[ "matching_tsa1_tsa2", @@ -344,6 +339,7 @@ def test_invalid__check_ref_tsa1_tsa2(): "identical_ref_tsa2_missing_tsa1", "valid_ref_tsa2_missing_tsa1", "missing_tsa2_ref", + "invalid_tsa1_identical_ref_tsa2", ], ) def test__check_tsa1_tsa2(test_df, expected_error): From 672a6950e51af9da9a4046496cf740d390ef538f Mon Sep 17 00:00:00 2001 From: Dan Lu Date: Wed, 13 Mar 2024 17:09:07 -0700 Subject: [PATCH 7/7] update _check_tsa1_tsa2 function name and add one test case --- genie_registry/maf.py | 4 ++-- tests/test_maf.py | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 74cfe657..3a97b97f 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -def _check_tsa1_tsa2(df): +def _check_allele_col_validity(df): """If maf file has both TSA1 and TSA2, TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2 """ @@ -270,7 +270,7 @@ def _validate(self, mutationDF): # "start with 'chr' or any 'WT' values.\n" # ) - error = _check_tsa1_tsa2(mutationDF) + error = _check_allele_col_validity(mutationDF) total_error.write(error) if process_functions.checkColExist(mutationDF, "TUMOR_SAMPLE_BARCODE"): diff --git a/tests/test_maf.py b/tests/test_maf.py index b0eb209b..1bb0a28c 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -1,3 +1,4 @@ +from cmath import nan from unittest.mock import mock_open, patch import pandas as pd @@ -196,7 +197,7 @@ def test_invalid_validation(maf_class): ) with patch.object( - genie_registry.maf, "_check_tsa1_tsa2", return_value="" + genie_registry.maf, "_check_allele_col_validity", return_value="" ) as check_tsa1_tsa2: error, warning = maf_class._validate(mafDf) check_tsa1_tsa2.assert_called_once_with(mafDf) @@ -330,6 +331,19 @@ def test_error__check_allele_col(): "maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. " "This is invalid. Please correct.\n", ), + ( + pd.DataFrame( + dict( + REFERENCE_ALLELE=[nan, "A", "A"], + TUMOR_SEQ_ALLELE1=["B", nan, "B"], + TUMOR_SEQ_ALLELE2=[nan, "C", "C"], + ) + ), + "maf: Contains both " + "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. " + "All values in TUMOR_SEQ_ALLELE1 must match all values in " + "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n", + ), ], ids=[ "matching_tsa1_tsa2", @@ -340,10 +354,11 @@ def test_error__check_allele_col(): "valid_ref_tsa2_missing_tsa1", "missing_tsa2_ref", "invalid_tsa1_identical_ref_tsa2", + "NAs_in_allele_cole", ], ) -def test__check_tsa1_tsa2(test_df, expected_error): - error = genie_registry.maf._check_tsa1_tsa2(test_df) +def test__check_allele_col_validity(test_df, expected_error): + error = genie_registry.maf._check_allele_col_validity(test_df) assert error == expected_error