Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-809] Validate allele columns #539

Merged
merged 9 commits into from
Nov 9, 2023
59 changes: 59 additions & 0 deletions genie/validate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import re
import logging
from typing import Dict, List, Optional

Expand Down Expand Up @@ -415,3 +416,61 @@ def standardize_string_for_validation(
return standardized_str
else:
return input_string


def get_invalid_allele_rows(
    input_data: pd.DataFrame,
    input_col: str,
    allowed_alleles: list,
    ignore_case: bool = False,
) -> pd.Index:
    """
    Find invalid indices in a DataFrame column based on allowed allele values.

    A value is valid when it is made up entirely of one or more of the
    allowed alleles, in any order and with repetition (e.g. 'ATATATATAT'
    when 'A' and 'T' are allowed). Missing values and empty strings are
    never valid.

    Args:
        input_data (pd.DataFrame): The DataFrame to search.
        input_col (str): The name of the column to check.
        allowed_alleles (list): The list of allowed allele values.
        ignore_case (bool, optional): whether to perform case-insensitive matching

    Returns:
        pd.Index: A pandas index object indicating the row indices that
        don't match the allowed alleles
    """
    # Review discussion on the pattern: it is meant to accept repeated
    # combinations such as 'ATATATATAT'; handling of '-', empty strings and
    # NAs was flagged there as needing care.
    #
    # Each allele is escaped and joined as an alternation rather than being
    # pasted into a character class. Inside [...] a '-' that is not last
    # silently becomes a range (["A", "-", "T"] -> [A-T], which accepts "B")
    # and characters such as ']' or '^' break or invert the class.
    # Empty entries are dropped so an empty string can never match.
    escaped_alleles = [
        re.escape(allele) for allele in map(str, allowed_alleles) if allele
    ]
    column = input_data[input_col]

    if not escaped_alleles:
        # Nothing is allowed, so every row is invalid (an empty character
        # class would otherwise be an invalid regular expression).
        matching_indices = pd.Series(False, index=column.index, dtype=bool)
    else:
        # \Z instead of $ so a value with a trailing newline is rejected.
        search_str = rf"^(?:{'|'.join(escaped_alleles)})+\Z"
        flags = re.IGNORECASE if ignore_case else 0
        # Cast to str so numeric columns don't raise on the .str accessor.
        # NAs should not be considered as a match: the isna() mask is taken
        # from the original column because the cast can turn a missing
        # value into text such as "nan", which could otherwise match.
        matching_indices = column.astype(str).str.match(
            search_str, flags=flags, na=False
        ) & ~column.isna()

    invalid_indices = input_data[~matching_indices].index
    return invalid_indices


def get_allele_validation_message(
    invalid_indices: pd.Index, invalid_col: str, allowed_alleles: list, fileformat: str
) -> tuple:
    """Creates the error/warning message for the check for invalid alleles

    Args:
        invalid_indices (pd.Index): the row indices that
            have invalid alleles. Only its length is inspected, so any
            sized collection (e.g. a list) is accepted.
        invalid_col (str): The column with the invalid values
        allowed_alleles (list): The list of allowed allele values.
        fileformat (str): Name of the fileformat

    Returns:
        tuple: The errors and warnings from the allele validation
            Defaults to blank strings
    """
    errors = ""
    # No warning is produced by this check; the slot is returned so the
    # result unpacks as (errors, warnings).
    warnings = ""
    # len() rather than truthiness: the truth value of a pandas Index is
    # ambiguous and raises.
    if len(invalid_indices) > 0:
        # Review note on this message: valid values are combinations of the
        # allowed alleles; a wording update was flagged as a follow-up.
        errors = (
            f"{fileformat}: Your {invalid_col} column has invalid allele values. "
            f"These are the accepted allele values: {allowed_alleles}.\n"
        )
    return errors, warnings
20 changes: 20 additions & 0 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class maf(FileTypeFormat):
_fileType = "maf"

_process_kwargs = []
_allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
_allowed_alleles = ["A", "T", "C", "G", "N", " ", "-"]
Copy link
Member

@thomasyu888 thomasyu888 Nov 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove " " and "-".

I like what you're thinking here for class attributes. In the future, these classes should follow SOLID principles; right now this one is doing too much. I can imagine having a Validate class that ValidateMaf inherits from, with different compositions if necessary.

For example, I can imagine having a _required_columns attribute.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, agreed. I originally had them defined in the _validate function and was debating between the two. Thinking ahead to when we start separating things out into the classes we talked about (e.g. a Processing class, a Validate class and a regular read-in-data class), I thought it would be easier to put constants like these in the class attributes, given how big _validate is getting. Even before then, we might have other allele-specific validation to add.


def _validateFilename(self, filePath):
"""
Expand Down Expand Up @@ -294,6 +296,24 @@ def _validate(self, mutationDF):
)
total_error.write(errors)
warning.write(warnings)

for allele_col in self._allele_cols:
if process_functions.checkColExist(mutationDF, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
mutationDF,
allele_col,
allowed_alleles=self._allowed_alleles,
ignore_case=True,
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=allele_col,
allowed_alleles=self._allowed_alleles,
fileformat=self._fileType,
)
total_error.write(errors)
warning.write(warnings)

return total_error.getvalue(), warning.getvalue()

def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
Expand Down
18 changes: 18 additions & 0 deletions genie_registry/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class vcf(FileTypeFormat):
_fileType = "vcf"

_process_kwargs = []
_allele_col = "REF"
thomasyu888 marked this conversation as resolved.
Show resolved Hide resolved
_allowed_alleles = ["A", "T", "C", "G", "N"]

def _validateFilename(self, filePath):
basename = os.path.basename(filePath[0])
Expand Down Expand Up @@ -137,6 +139,22 @@ def _validate(self, vcfdf):
total_error += error
warning += warn

if process_functions.checkColExist(vcfdf, self._allele_col):
invalid_indices = validate.get_invalid_allele_rows(
vcfdf,
self._allele_col,
allowed_alleles=self._allowed_alleles,
ignore_case=True,
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=self._allele_col,
allowed_alleles=self._allowed_alleles,
fileformat=self._fileType,
)
total_error += errors
warning += warnings

# No white spaces
white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
if sum(white_space) > 0:
Expand Down
8 changes: 8 additions & 0 deletions tests/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def test_firstcolumn_validation(maf_class):
"maf: First column header must be "
"one of these: CHROMOSOME, HUGO_SYMBOL, "
"TUMOR_SAMPLE_BARCODE.\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
)
assert error == expectedErrors
assert warning == ""
Expand Down Expand Up @@ -147,6 +149,10 @@ def test_errors_validation(maf_class):
"This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, "
"10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
)
expectedWarnings = (
"maf: "
Expand Down Expand Up @@ -195,6 +201,8 @@ def test_invalid_validation(maf_class):
"maf: "
"TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
)
expectedWarnings = (
"maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, "
Expand Down
91 changes: 91 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,3 +768,94 @@ def test_that_standardize_string_for_validation_returns_expected(
allow_underscore=allow_underscore,
)
assert test_str == expected


@pytest.mark.parametrize(
    "input,expected_index,allowed_alleles,ignore_case",
    [
        # Review note on this case: a suggestion proposed removing '-' from
        # the multi-character values and treating whitespace as invalid,
        # relying on pandas for NA handling (str.match's `na` parameter).
        pytest.param(
            pd.DataFrame(
                {"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]}
            ),
            pd.Index([]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="correct_alleles",
        ),
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", " "]}),
            pd.Index([]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="correct_alleles_case",
        ),
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}),
            pd.Index([0, 1]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="invalid_special_chars",
        ),
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}),
            pd.Index([0]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="invalid_chars",
        ),
        # Review note on this case: allele columns read in as numeric could
        # hold float('nan'); standardizing column dtypes before validation
        # was raised as a possible follow-up.
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}),
            pd.Index([1, 2]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="missing_entries",
        ),
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}),
            pd.Index([0]),
            ["A", "T", "C", "G", " ", "-"],
            False,
            id="case_not_ignored",
        ),
    ],
)
def test_that_get_invalid_allele_rows_returns_expected(
    input, expected_index, allowed_alleles, ignore_case
):
    """Check the index of invalid rows reported for each parametrized frame."""
    actual_index = validate.get_invalid_allele_rows(
        input,
        input_col="REFERENCE_ALLELE",
        allowed_alleles=allowed_alleles,
        ignore_case=ignore_case,
    )
    assert actual_index.equals(expected_index)


@pytest.mark.parametrize(
    "input_invalid_rows,expected_error,expected_warning",
    [
        pytest.param(
            pd.Index([1, 2, 3]),
            (
                "maf: Your REFERENCE_ALLELE column has invalid allele values. "
                "These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n"
            ),
            "",
            id="has_invalid_alleles",
        ),
        pytest.param([], "", "", id="has_no_invalid_alleles"),
    ],
)
def test_that_get_allele_validation_message_returns_expected(
    input_invalid_rows, expected_error, expected_warning
):
    """Check the (error, warning) pair built for a set of invalid row indices."""
    messages = validate.get_allele_validation_message(
        input_invalid_rows,
        invalid_col="REFERENCE_ALLELE",
        allowed_alleles=["A", "C", "T", "G", " ", "-"],
        fileformat="maf",
    )
    error, warning = messages
    assert error == expected_error
    assert warning == expected_warning
20 changes: 11 additions & 9 deletions tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_validation_valid_no_samples(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -47,7 +47,7 @@ def test_validation_valid_one_sample_tumor(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -67,7 +67,7 @@ def test_validation_valid_one_sample(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -88,7 +88,7 @@ def test_validation_missing_format_col(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -107,7 +107,7 @@ def test_validation_invalid_one_sample(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -130,7 +130,7 @@ def test_validation_valid_two_samples(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -151,7 +151,7 @@ def test_validation_invalid_two_samples_tumor(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -172,7 +172,7 @@ def test_validation_invalid_two_samples_normal(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -193,7 +193,7 @@ def test_validation_invalid_white_space(vcf_class):
"#CHROMM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AA ED1", "AAAS"],
Expand Down Expand Up @@ -231,6 +231,8 @@ def test_validation_invalid_content(vcf_class):
"space delimited instead of tab delimited.\n"
"vcf: Please double check your #CHROM column. This column must only be these values: "
"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
"vcf: Your REF column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N'].\n"
)
expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n"
assert error == expectedError
Expand Down