Sage-Bionetworks · rxu17 · Nov 9, 2023 · Nov 3, 2023 · Nov 3, 2023 · Nov 3, 2023
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import re
 import logging
 from typing import Dict, List, Optional
 
@@ -415,3 +416,61 @@ def standardize_string_for_validation(
         return standardized_str
     else:
         return input_string
+
+
def get_invalid_allele_rows(
    input_data: pd.DataFrame,
    input_col: str,
    allowed_alleles: list,
    ignore_case: bool = False,
) -> pd.Index:
    """
    Find the rows of a DataFrame column that do not hold a valid allele.

    A value is valid only when it is a non-empty string made up entirely of
    characters taken from ``allowed_alleles``. Missing values (None, NaN,
    pd.NA) and any other non-string values are always reported as invalid.

    Args:
        input_data (pd.DataFrame): The DataFrame to search.
        input_col (str): The name of the column to check.
        allowed_alleles (list): The allowed allele values. The entries are
            joined together and every character of the result is accepted,
            in any order and any number of times. Characters that are
            special inside a regex character class ("-", "]", "^", "\\")
            are matched literally. An empty list means no value is valid.
        ignore_case (bool, optional): whether to perform case-insensitive
            matching. Defaults to False.

    Returns:
        pd.Index: A pandas index object indicating the row indices that
        don't match the allowed alleles
    """
    allowed_chars = "".join(allowed_alleles)
    flags = re.IGNORECASE if ignore_case else 0

    # Escape the characters so that e.g. a "-" in the middle of the list is a
    # literal hyphen rather than a range such as "A-T". With nothing allowed
    # there is no pattern to build ("[]+" is not a valid regex).
    allele_pattern = (
        re.compile(f"[{re.escape(allowed_chars)}]+", flags)
        if allowed_chars
        else None
    )

    # fullmatch (instead of "^...$") also rejects values that end in a
    # newline. Checking each value individually keeps NAs and non-string
    # values from ever counting as a match, whatever the column dtype is.
    is_valid = pd.Series(
        [
            allele_pattern is not None
            and isinstance(value, str)
            and allele_pattern.fullmatch(value) is not None
            for value in input_data[input_col]
        ],
        index=input_data.index,
        dtype=bool,
    )
    return input_data[~is_valid].index
+
+
def get_allele_validation_message(
    invalid_indices: pd.Index, invalid_col: str, allowed_alleles: list, fileformat: str
) -> tuple:
    """Creates the error/warning message for the check for invalid alleles

    Args:
        invalid_indices (pd.Index): the row indices that have invalid
            alleles. Only its length is inspected, so any sized collection
            (e.g. an empty list) is accepted as well.
        invalid_col (str): The column with the invalid values
        allowed_alleles (list): The list of allowed allele values.
        fileformat (str): Name of the fileformat

    Returns:
        tuple: The errors and warnings from the allele validation, in that
               order. The error is a blank string when there are no invalid
               rows; the warning is always a blank string.
    """
    # This check never produces a warning; the slot is kept so the result has
    # the same (errors, warnings) shape the callers unpack.
    warnings = ""
    if len(invalid_indices) == 0:
        return "", warnings

    errors = (
        f"{fileformat}: Your {invalid_col} column has invalid allele values. "
        f"These are the accepted allele values: {allowed_alleles}.\n"
    )
    return errors, warnings
@@ -70,6 +70,8 @@ class maf(FileTypeFormat):
     _fileType = "maf"
 
     _process_kwargs = []
+    _allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
+    _allowed_alleles = ["A", "T", "C", "G", "N", " ", "-"]
 
     def _validateFilename(self, filePath):
         """
@@ -294,6 +296,24 @@ def _validate(self, mutationDF):
             )
             total_error.write(errors)
             warning.write(warnings)
+
+        for allele_col in self._allele_cols:
+            if process_functions.checkColExist(mutationDF, allele_col):
+                invalid_indices = validate.get_invalid_allele_rows(
+                    mutationDF,
+                    allele_col,
+                    allowed_alleles=self._allowed_alleles,
+                    ignore_case=True,
+                )
+                errors, warnings = validate.get_allele_validation_message(
+                    invalid_indices,
+                    invalid_col=allele_col,
+                    allowed_alleles=self._allowed_alleles,
+                    fileformat=self._fileType,
+                )
+                total_error.write(errors)
+                warning.write(warnings)
+
         return total_error.getvalue(), warning.getvalue()
 
     def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:

@@ -18,6 +18,8 @@ class vcf(FileTypeFormat):
     _fileType = "vcf"
 
     _process_kwargs = []
+    _allele_col = "REF"
+    _allowed_alleles = ["A", "T", "C", "G", "N"]
 
     def _validateFilename(self, filePath):
         basename = os.path.basename(filePath[0])
@@ -137,6 +139,22 @@ def _validate(self, vcfdf):
         total_error += error
         warning += warn
 
+        if process_functions.checkColExist(vcfdf, self._allele_col):
+            invalid_indices = validate.get_invalid_allele_rows(
+                vcfdf,
+                self._allele_col,
+                allowed_alleles=self._allowed_alleles,
+                ignore_case=True,
+            )
+            errors, warnings = validate.get_allele_validation_message(
+                invalid_indices,
+                invalid_col=self._allele_col,
+                allowed_alleles=self._allowed_alleles,
+                fileformat=self._fileType,
+            )
+            total_error += errors
+            warning += warnings
+
         # No white spaces
         white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
         if sum(white_space) > 0:

@@ -94,6 +94,8 @@ def test_firstcolumn_validation(maf_class):
         "maf: First column header must be "
         "one of these: CHROMOSOME, HUGO_SYMBOL, "
         "TUMOR_SAMPLE_BARCODE.\n"
+        "maf: Your REFERENCE_ALLELE column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
     )
     assert error == expectedErrors
     assert warning == ""
@@ -147,6 +149,10 @@ def test_errors_validation(maf_class):
         "This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, "
         "10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
         "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
+        "maf: Your REFERENCE_ALLELE column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
+        "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
     )
     expectedWarnings = (
         "maf: "
@@ -195,6 +201,8 @@ def test_invalid_validation(maf_class):
         "maf: "
         "TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n"
         "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
+        "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
     )
     expectedWarnings = (
         "maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, "

@@ -768,3 +768,94 @@ def test_that_standardize_string_for_validation_returns_expected(
         allow_underscore=allow_underscore,
     )
     assert test_str == expected
+
+
@pytest.mark.parametrize(
    "input_data,expected_index,allowed_alleles,ignore_case",
    [
        (
            pd.DataFrame(
                {"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]}
            ),
            pd.Index([]),
            ["A", "T", "C", "G", " ", "-"],
            True,
        ),
        (
            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", "  "]}),
            pd.Index([]),
            ["A", "T", "C", "G", " ", "-"],
            True,
        ),
        (
            pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}),
            pd.Index([0, 1]),
            ["A", "T", "C", "G", " ", "-"],
            True,
        ),
        (
            pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}),
            pd.Index([0]),
            ["A", "T", "C", "G", " ", "-"],
            True,
        ),
        (
            pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}),
            pd.Index([1, 2]),
            ["A", "T", "C", "G", " ", "-"],
            True,
        ),
        (
            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}),
            pd.Index([0]),
            ["A", "T", "C", "G", " ", "-"],
            False,
        ),
    ],
    ids=[
        "correct_alleles",
        "correct_alleles_case",
        "invalid_special_chars",
        "invalid_chars",
        "missing_entries",
        "case_not_ignored",
    ],
)
def test_that_get_invalid_allele_rows_returns_expected(
    input_data, expected_index, allowed_alleles, ignore_case
):
    """Check that exactly the rows with invalid alleles are returned.

    Each case supplies a one-column DataFrame, the index of the rows expected
    to be flagged, the allowed allele characters and the case-sensitivity
    switch.
    """
    invalid_rows = validate.get_invalid_allele_rows(
        input_data,
        input_col="REFERENCE_ALLELE",
        allowed_alleles=allowed_alleles,
        ignore_case=ignore_case,
    )
    assert invalid_rows.equals(expected_index)
+
+
@pytest.mark.parametrize(
    "input_invalid_rows,expected_error,expected_warning",
    [
        (
            pd.Index([1, 2, 3]),
            (
                "maf: Your REFERENCE_ALLELE column has invalid allele values. "
                "These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n"
            ),
            "",
        ),
        ([], "", ""),
    ],
    ids=["has_invalid_alleles", "has_no_invalid_alleles"],
)
def test_that_get_allele_validation_message_returns_expected(
    input_invalid_rows, expected_error, expected_warning
):
    """Check the error and warning text built for the invalid-allele check.

    Covers one case with invalid rows (an error message is expected) and one
    case without any (both messages are expected to be blank).
    """
    accepted_alleles = ["A", "C", "T", "G", " ", "-"]
    actual_error, actual_warning = validate.get_allele_validation_message(
        input_invalid_rows,
        invalid_col="REFERENCE_ALLELE",
        allowed_alleles=accepted_alleles,
        fileformat="maf",
    )
    assert actual_error == expected_error
    assert actual_warning == expected_warning
@@ -29,7 +29,7 @@ def test_validation_valid_no_samples(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -47,7 +47,7 @@ def test_validation_valid_one_sample_tumor(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -67,7 +67,7 @@ def test_validation_valid_one_sample(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -88,7 +88,7 @@ def test_validation_missing_format_col(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -107,7 +107,7 @@ def test_validation_invalid_one_sample(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -130,7 +130,7 @@ def test_validation_valid_two_samples(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -151,7 +151,7 @@ def test_validation_invalid_two_samples_tumor(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -172,7 +172,7 @@ def test_validation_invalid_two_samples_normal(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -193,7 +193,7 @@ def test_validation_invalid_white_space(vcf_class):
             "#CHROMM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AA ED1", "AAAS"],
@@ -231,6 +231,8 @@ def test_validation_invalid_content(vcf_class):
         "space delimited instead of tab delimited.\n"
         "vcf: Please double check your #CHROM column.  This column must only be these values: "
         "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
+        "vcf: Your REF column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N'].\n"
     )
     expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n"
     assert error == expectedError