-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GEN-809] Validate allele columns #539
Changes from 4 commits
9776d12
4df61fd
14ab7dd
370bc3b
e84124a
4fea695
780ee66
da74efc
4d3222d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
#!/usr/bin/env python3 | ||
import re | ||
import logging | ||
from typing import Dict, List, Optional | ||
|
||
|
@@ -415,3 +416,61 @@ def standardize_string_for_validation( | |
return standardized_str | ||
else: | ||
return input_string | ||
|
||
|
||
def get_invalid_allele_rows( | ||
input_data: pd.DataFrame, | ||
input_col: str, | ||
allowed_alleles: list, | ||
ignore_case: bool = False, | ||
) -> pd.Index: | ||
""" | ||
Find invalid indices in a DataFrame column based on allowed allele values. | ||
|
||
Args: | ||
input_data (pd.DataFrame): The DataFrame to search. | ||
input_col (str): The name of the column to check. | ||
allowed_alleles (list): The list of allowed allele values. | ||
ignore_case (bool, optional): whether to perform case-insensitive matching | ||
|
||
Returns: | ||
pd.Index: A pandas index object indicating the row indices that | ||
don't match the allowed alleles | ||
""" | ||
search_str = rf"^[{''.join(allowed_alleles)}]+$" | ||
if ignore_case: | ||
flags = re.IGNORECASE | ||
else: | ||
flags = 0 # no flags | ||
# NAs should not be considered as a match | ||
matching_indices = input_data[input_col].str.match( | ||
search_str, flags=flags, na=False | ||
) | ||
invalid_indices = input_data[~matching_indices].index | ||
return invalid_indices | ||
|
||
|
||
def get_allele_validation_message( | ||
invalid_indices: pd.Series, invalid_col: str, allowed_alleles: list, fileformat: str | ||
) -> tuple: | ||
"""Creates the error/warning message for the check for invalid alleles | ||
|
||
Args: | ||
invalid_indices (pd.Series): the row indices that | ||
have invalid alleles | ||
invalid_col (str): The column with the invalid values | ||
allowed_alleles (list): The list of allowed allele values. | ||
fileformat (str): Name of the fileformat | ||
|
||
Returns: | ||
tuple: The errors and warnings from the allele validation | ||
Defaults to blank strings | ||
""" | ||
errors = "" | ||
warnings = "" | ||
if len(invalid_indices) > 0: | ||
errors = ( | ||
f"{fileformat}: Your {invalid_col} column has invalid allele values. " | ||
f"These are the accepted allele values: {allowed_alleles}.\n" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The alleles should be combinations of the allowed alleles. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, message will be updated based on discussions earlier. |
||
) | ||
return errors, warnings |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,6 +70,8 @@ class maf(FileTypeFormat): | |
_fileType = "maf" | ||
|
||
_process_kwargs = [] | ||
_allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"] | ||
_allowed_alleles = ["A", "T", "C", "G", "N", " ", "-"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove " " and "-". I like what you're thinking here for class attributes. In the future, these classes should follow SOLID principles, right now it is doing too much. I can imagine having a For example, I can imagine having a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes agreed, I originally had them defined in the |
||
|
||
def _validateFilename(self, filePath): | ||
""" | ||
|
@@ -294,6 +296,24 @@ def _validate(self, mutationDF): | |
) | ||
total_error.write(errors) | ||
warning.write(warnings) | ||
|
||
for allele_col in self._allele_cols: | ||
if process_functions.checkColExist(mutationDF, allele_col): | ||
invalid_indices = validate.get_invalid_allele_rows( | ||
mutationDF, | ||
allele_col, | ||
allowed_alleles=self._allowed_alleles, | ||
ignore_case=True, | ||
) | ||
errors, warnings = validate.get_allele_validation_message( | ||
invalid_indices, | ||
invalid_col=allele_col, | ||
allowed_alleles=self._allowed_alleles, | ||
fileformat=self._fileType, | ||
) | ||
total_error.write(errors) | ||
warning.write(warnings) | ||
|
||
return total_error.getvalue(), warning.getvalue() | ||
|
||
def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple: | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -768,3 +768,94 @@ def test_that_standardize_string_for_validation_returns_expected( | |||||
allow_underscore=allow_underscore, | ||||||
) | ||||||
assert test_str == expected | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
"input,expected_index,allowed_alleles,ignore_case", | ||||||
[ | ||||||
( | ||||||
pd.DataFrame( | ||||||
{"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
lets make the white space invalid. Lets try and rely on pandas as much as possible, if pandas reads in the " " as float('nan') then it's fine to include it here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, thankfully |
||||||
), | ||||||
pd.Index([]), | ||||||
["A", "T", "C", "G", " ", "-"], | ||||||
True, | ||||||
), | ||||||
( | ||||||
pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", " "]}), | ||||||
pd.Index([]), | ||||||
["A", "T", "C", "G", " ", "-"], | ||||||
True, | ||||||
), | ||||||
( | ||||||
pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}), | ||||||
pd.Index([0, 1]), | ||||||
["A", "T", "C", "G", " ", "-"], | ||||||
True, | ||||||
), | ||||||
( | ||||||
pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}), | ||||||
pd.Index([0]), | ||||||
["A", "T", "C", "G", " ", "-"], | ||||||
True, | ||||||
), | ||||||
( | ||||||
pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to this code, it seems like it could be possible for allele columns to be read in as numeric and get something like This makes me wonder if we should have standardization of columns and their datatypes before hand (either they have the wrong datatype and stop validation there, or they have the wrong datatype, and it gets converted before further validation) otherwise this just adds a lot more responsibilities for the validations that are string column specific as they will error out with numeric columns. Otherwise, will just have to add another if statement logic to check that the column is string or convert it to str because we won't be able to do our regex matching at all |
||||||
pd.Index([1, 2]), | ||||||
["A", "T", "C", "G", " ", "-"], | ||||||
True, | ||||||
), | ||||||
( | ||||||
pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}), | ||||||
pd.Index([0]), | ||||||
["A", "T", "C", "G", " ", "-"], | ||||||
False, | ||||||
), | ||||||
], | ||||||
ids=[ | ||||||
"correct_alleles", | ||||||
"correct_alleles_case", | ||||||
"invalid_special_chars", | ||||||
"invalid_chars", | ||||||
"missing_entries", | ||||||
"case_not_ignored", | ||||||
], | ||||||
) | ||||||
def test_that_get_invalid_allele_rows_returns_expected( | ||||||
input, expected_index, allowed_alleles, ignore_case | ||||||
): | ||||||
invalid_rows = validate.get_invalid_allele_rows( | ||||||
input, | ||||||
input_col="REFERENCE_ALLELE", | ||||||
allowed_alleles=allowed_alleles, | ||||||
ignore_case=ignore_case, | ||||||
) | ||||||
assert invalid_rows.equals(expected_index) | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
"input_invalid_rows,expected_error,expected_warning", | ||||||
[ | ||||||
( | ||||||
pd.Index([1, 2, 3]), | ||||||
( | ||||||
"maf: Your REFERENCE_ALLELE column has invalid allele values. " | ||||||
"These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n" | ||||||
), | ||||||
"", | ||||||
), | ||||||
([], "", ""), | ||||||
], | ||||||
ids=["has_invalid_alleles", "has_no_invalid_alleles"], | ||||||
) | ||||||
def test_that_get_allele_validation_message_returns_expected( | ||||||
input_invalid_rows, expected_error, expected_warning | ||||||
): | ||||||
error, warning = validate.get_allele_validation_message( | ||||||
input_invalid_rows, | ||||||
invalid_col="REFERENCE_ALLELE", | ||||||
allowed_alleles=["A", "C", "T", "G", " ", "-"], | ||||||
fileformat="maf", | ||||||
) | ||||||
assert error == expected_error | ||||||
assert warning == expected_warning |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this regex does actually check if for alleles like 'ATATATATAT'
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does. Have to update it for '-', empty strings and NAs based on our discussion