From 9776d1269dc2eb71092ed00fb859e0512dfb9678 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:52:27 -0700 Subject: [PATCH 1/8] add code for allele validation - initial --- genie/validate.py | 59 +++++++++++++++++++++++++++ genie_registry/maf.py | 15 +++++++ genie_registry/vcf.py | 16 ++++++++ tests/test_maf.py | 8 ++++ tests/test_validate.py | 91 ++++++++++++++++++++++++++++++++++++++++++ tests/test_vcf.py | 20 +++++----- 6 files changed, 200 insertions(+), 9 deletions(-) diff --git a/genie/validate.py b/genie/validate.py index 10d8f586..d8bcd9c7 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import re import logging from typing import Dict, List, Optional @@ -415,3 +416,61 @@ def standardize_string_for_validation( return standardized_str else: return input_string + + +def get_invalid_allele_rows( + input_data: pd.DataFrame, + input_col: str, + allowed_alleles: list, + ignore_case: bool = False, +) -> pd.Index: + """ + Find invalid indices in a DataFrame column based on allowed allele values. + + Args: + input_data (pd.DataFrame): The DataFrame to search. + input_col (str): The name of the column to check. + allowed_alleles (list): The list of allowed allele values. + ignore_case (bool, optional): whether to perform case-insensitive matching + + Returns: + pd.Index: A pandas index object indicating the row indices that + don't match the allowed alleles + """ + search_str = rf"^[{''.join(allowed_alleles)}]+$" + if ignore_case: + flags = re.IGNORECASE + else: + flags = 0 # no flags + # NAs should not be considered as a match + matching_indices = input_data[input_col].str.match( + search_str, flags=flags, na=False + ) + invalid_indices = input_data[~matching_indices].index + return invalid_indices + + +def get_allele_validation_message( + invalid_indices: pd.Series, invalid_col: str, allowed_alleles: list, fileformat: str +) -> tuple: + """Creates the error/warning message for the check for invalid alleles + + Args: + invalid_indices (pd.Series): the row indices that + have invalid alleles + invalid_col (str): The column with the invalid values + allowed_alleles (list): The list of allowed allele values. + fileformat (str): Name of the fileformat + + Returns: + tuple: The errors and warnings from the allele validation + Defaults to blank strings + """ + errors = "" + warnings = "" + if len(invalid_indices) > 0: + errors = ( + f"{fileformat}: Your {invalid_col} column has invalid allele values. " + f"These are the accepted allele values: {allowed_alleles}.\n" + ) + return errors, warnings diff --git a/genie_registry/maf.py b/genie_registry/maf.py index ab8f9193..4575d189 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -294,6 +294,21 @@ def _validate(self, mutationDF): ) total_error.write(errors) warning.write(warnings) + + # TODO: add these lists as class attribute or global + allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"] + allowed_alleles = ['A','T','C','G','N', ' ', '-'] + for allele_col in allele_cols: + if process_functions.checkColExist(mutationDF, allele_col): + invalid_indices = validate.get_invalid_allele_rows( + mutationDF, allele_col, allowed_alleles = allowed_alleles, ignore_case = True + ) + errors, warnings = validate.get_allele_validation_message( + invalid_indices, invalid_col = allele_col, allowed_alleles = allowed_alleles, fileformat="maf" + ) + total_error.write(errors) + warning.write(warnings) + return total_error.getvalue(), warning.getvalue() def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple: diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index 71ad86a4..75525bd2 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -137,6 +137,22 @@ def _validate(self, vcfdf): total_error += error warning += warn + # TODO: add this as class attribute or global + allele_col = "REF" + allowed_alleles = ["A", "T", "C", "G", "N"] + if process_functions.checkColExist(vcfdf, allele_col): + invalid_indices = validate.get_invalid_allele_rows( + vcfdf, allele_col, allowed_alleles=allowed_alleles, ignore_case=True + ) + errors, warnings = validate.get_allele_validation_message( + invalid_indices, + invalid_col=allele_col, + allowed_alleles=allowed_alleles, + fileformat="vcf", + ) + total_error += errors + warning += warnings + # No white spaces white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1) if sum(white_space) > 0: diff --git a/tests/test_maf.py b/tests/test_maf.py index ef07d54d..9d9db949 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -94,6 +94,8 @@ def test_firstcolumn_validation(maf_class): "maf: First column header must be " "one of these: CHROMOSOME, HUGO_SYMBOL, " "TUMOR_SAMPLE_BARCODE.\n" + "maf: Your REFERENCE_ALLELE column has invalid allele values. " + "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" ) assert error == expectedErrors assert warning == "" @@ -147,6 +149,10 @@ def test_errors_validation(maf_class): "This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, " "10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n" "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" + "maf: Your REFERENCE_ALLELE column has invalid allele values. " + "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" + "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. " + "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" ) expectedWarnings = ( "maf: " @@ -195,6 +201,8 @@ def test_invalid_validation(maf_class): "maf: " "TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n" "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" + "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. " + "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" ) expectedWarnings = ( "maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, " diff --git a/tests/test_validate.py b/tests/test_validate.py index 3589166b..01b87d7d 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -768,3 +768,94 @@ def test_that_standardize_string_for_validation_returns_expected( allow_underscore=allow_underscore, ) assert test_str == expected + + +@pytest.mark.parametrize( + "input,expected_index,allowed_alleles,ignore_case", + [ + ( + pd.DataFrame( + {"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]} + ), + pd.Index([]), + ["A", "T", "C", "G", " ", "-"], + True, + ), + ( + pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", " "]}), + pd.Index([]), + ["A", "T", "C", "G", " ", "-"], + True, + ), + ( + pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}), + pd.Index([0, 1]), + ["A", "T", "C", "G", " ", "-"], + True, + ), + ( + pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}), + pd.Index([0]), + ["A", "T", "C", "G", " ", "-"], + True, + ), + ( + pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), + pd.Index([1, 2]), + ["A", "T", "C", "G", " ", "-"], + True, + ), + ( + pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}), + pd.Index([0]), + ["A", "T", "C", "G", " ", "-"], + False, + ), + ], + ids=[ + "correct_alleles", + "correct_alleles_case", + "invalid_special_chars", + "invalid_chars", + "missing_entries", + "case_not_ignored", + ], +) +def test_that_get_invalid_allele_rows_returns_expected( + input, expected_index, allowed_alleles, ignore_case +): + invalid_rows = validate.get_invalid_allele_rows( + input, + input_col="REFERENCE_ALLELE", + allowed_alleles=allowed_alleles, + ignore_case=ignore_case, + ) + assert invalid_rows.equals(expected_index) + + +@pytest.mark.parametrize( + "input_invalid_rows,expected_error,expected_warning", + [ + ( + pd.Index([1, 2, 3]), + ( + "maf: Your REFERENCE_ALLELE column has invalid allele values. " + "These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n" + ), + "", + ), + ([], "", ""), + ], + ids=["has_invalid_alleles", "has_no_invalid_alleles"], +) +def test_that_get_allele_validation_message_returns_expected( + input_invalid_rows, expected_error, expected_warning +): + error, warning = validate.get_allele_validation_message( + input_invalid_rows, + invalid_col="REFERENCE_ALLELE", + allowed_alleles=["A", "C", "T", "G", " ", "-"], + fileformat="maf", + ) + assert error == expected_error + assert warning == expected_warning diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 1c78d87f..6cff29d0 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -29,7 +29,7 @@ def test_validation_valid_no_samples(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -47,7 +47,7 @@ def test_validation_valid_one_sample_tumor(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -67,7 +67,7 @@ def test_validation_valid_one_sample(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -88,7 +88,7 @@ def test_validation_missing_format_col(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -107,7 +107,7 @@ def test_validation_invalid_one_sample(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -130,7 +130,7 @@ def test_validation_valid_two_samples(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -151,7 +151,7 @@ def test_validation_invalid_two_samples_tumor(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -172,7 +172,7 @@ def test_validation_invalid_two_samples_normal(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -193,7 +193,7 @@ def test_validation_invalid_white_space(vcf_class): "#CHROMM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AA ED1", "AAAS"], @@ -231,6 +231,8 @@ def test_validation_invalid_content(vcf_class): "space delimited instead of tab delimited.\n" "vcf: Please double check your #CHROM column. This column must only be these values: " "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n" + "vcf: Your REF column has invalid allele values. " + "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N'].\n" ) expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n" assert error == expectedError From 4df61fd6e5f9c3d31ddd42204c03fcce7076132a Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Thu, 2 Nov 2023 18:03:24 -0700 Subject: [PATCH 2/8] add to be class attributes --- genie_registry/maf.py | 21 +++++++++++++-------- genie_registry/vcf.py | 15 +++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 4575d189..bb1f2cd1 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -70,6 +70,8 @@ class maf(FileTypeFormat): _fileType = "maf" _process_kwargs = [] + _allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"] + _allowed_alleles = ["A", "T", "C", "G", "N", " ", "-"] def _validateFilename(self, filePath): """ @@ -294,21 +296,24 @@ def _validate(self, mutationDF): ) total_error.write(errors) warning.write(warnings) - - # TODO: add these lists as class attribute or global - allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"] - allowed_alleles = ['A','T','C','G','N', ' ', '-'] - for allele_col in allele_cols: + + for allele_col in self._allele_cols: if process_functions.checkColExist(mutationDF, allele_col): invalid_indices = validate.get_invalid_allele_rows( - mutationDF, allele_col, allowed_alleles = allowed_alleles, ignore_case = True + mutationDF, + allele_col, + allowed_alleles=self._allowed_alleles, + ignore_case=True, ) errors, warnings = validate.get_allele_validation_message( - invalid_indices, invalid_col = allele_col, allowed_alleles = allowed_alleles, fileformat="maf" + invalid_indices, + invalid_col=allele_col, + allowed_alleles=self._allowed_alleles, + fileformat=self._fileType, ) total_error.write(errors) warning.write(warnings) - + return total_error.getvalue(), warning.getvalue() def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple: diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index 75525bd2..78406d92 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -18,6 +18,8 @@ class vcf(FileTypeFormat): _fileType = "vcf" _process_kwargs = [] + _allele_col = "REF" + _allowed_alleles = ["A", "T", "C", "G", "N"] def _validateFilename(self, filePath): basename = os.path.basename(filePath[0]) @@ -137,18 +139,15 @@ def _validate(self, vcfdf): total_error += error warning += warn - # TODO: add this as class attribute or global - allele_col = "REF" - allowed_alleles = ["A", "T", "C", "G", "N"] - if process_functions.checkColExist(vcfdf, allele_col): + if process_functions.checkColExist(vcfdf, self._allele_col): invalid_indices = validate.get_invalid_allele_rows( - vcfdf, allele_col, allowed_alleles=allowed_alleles, ignore_case=True + vcfdf, self._allele_col, allowed_alleles=self._allowed_alleles, ignore_case=True ) errors, warnings = validate.get_allele_validation_message( invalid_indices, - invalid_col=allele_col, - allowed_alleles=allowed_alleles, - fileformat="vcf", + invalid_col=self._allele_col, + allowed_alleles=self._allowed_alleles, + fileformat=self._fileType, ) total_error += errors warning += warnings From 370bc3b6e222505dd6a3c1a453281b0993866fa7 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Thu, 2 Nov 2023 18:29:52 -0700 Subject: [PATCH 3/8] merge in develop changes, add linting --- genie_registry/vcf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index 78406d92..61b53432 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -141,7 +141,10 @@ def _validate(self, vcfdf): if process_functions.checkColExist(vcfdf, self._allele_col): invalid_indices = validate.get_invalid_allele_rows( - vcfdf, self._allele_col, allowed_alleles=self._allowed_alleles, ignore_case=True + vcfdf, + self._allele_col, + allowed_alleles=self._allowed_alleles, + ignore_case=True, ) errors, warnings = validate.get_allele_validation_message( invalid_indices, From e84124ac420c19410716301b54067d1498afa168 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:34:54 -0800 Subject: [PATCH 4/8] revamp validation msg and function for combination and individually used allele values --- genie/validate.py | 39 +++++-- genie_registry/maf.py | 10 +- genie_registry/vcf.py | 14 ++- tests/test_maf.py | 16 ++- tests/test_validate.py | 227 +++++++++++++++++++++++++++-------------- tests/test_vcf.py | 4 +- 6 files changed, 214 insertions(+), 96 deletions(-) diff --git a/genie/validate.py b/genie/validate.py index d78bf073..733749b5 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -421,8 +421,10 @@ def standardize_string_for_validation( def get_invalid_allele_rows( input_data: pd.DataFrame, input_col: str, - allowed_alleles: list, + allowed_comb_alleles: list, + allowed_ind_alleles: list, ignore_case: bool = False, + allow_na: bool = False, ) -> pd.Index: """ Find invalid indices in a DataFrame column based on allowed allele values. @@ -430,28 +432,43 @@ def get_invalid_allele_rows( Args: input_data (pd.DataFrame): The DataFrame to search. input_col (str): The name of the column to check. - allowed_alleles (list): The list of allowed allele values. + allowed_comb_alleles (list): The list of allowed allele values + (can appear in combinations or individually) + allowed_ind_alleles (list): The list of allowed allele values + (can only appear individually) ignore_case (bool, optional): whether to perform case-insensitive matching + allow_na (bool, optional): whether to allow NAs to be an allowed allele + value or not. Returns: pd.Index: A pandas index object indicating the row indices that don't match the allowed alleles """ - search_str = rf"^[{''.join(allowed_alleles)}]+$" + search_str = "" + if allowed_comb_alleles: + search_str += f'^[{re.escape("".join(allowed_comb_alleles))}]+$' + + if allowed_ind_alleles: + search_str += f'|^[{re.escape("".join(allowed_ind_alleles))}]+$' + if ignore_case: flags = re.IGNORECASE else: flags = 0 # no flags - # NAs should not be considered as a match + matching_indices = input_data[input_col].str.match( - search_str, flags=flags, na=False + search_str, flags=flags, na=allow_na ) invalid_indices = input_data[~matching_indices].index return invalid_indices def get_allele_validation_message( - invalid_indices: pd.Series, invalid_col: str, allowed_alleles: list, fileformat: str + invalid_indices: pd.Series, + invalid_col: str, + allowed_comb_alleles: list, + allowed_ind_alleles: list, + fileformat: str, ) -> tuple: """Creates the error/warning message for the check for invalid alleles @@ -459,7 +476,10 @@ def get_allele_validation_message( invalid_indices (pd.Series): the row indices that have invalid alleles invalid_col (str): The column with the invalid values - allowed_alleles (list): The list of allowed allele values. + allowed_comb_alleles (list): The list of allowed allele values + (can appear in combinations or individually) + allowed_ind_alleles (list): The list of allowed allele values + (can only appear individually) fileformat (str): Name of the fileformat Returns: @@ -471,6 +491,9 @@ def get_allele_validation_message( if len(invalid_indices) > 0: errors = ( f"{fileformat}: Your {invalid_col} column has invalid allele values. " - f"These are the accepted allele values: {allowed_alleles}.\n" + "This is the list of accepted allele values that can appear individually " + f"or in combination with each other: {','.join(allowed_comb_alleles)}.\n" + "This is the list of accepted allele values that can only appear individually: " + f"{','.join(allowed_ind_alleles)}\n" ) return errors, warnings diff --git a/genie_registry/maf.py b/genie_registry/maf.py index bb1f2cd1..c53e2529 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -71,7 +71,8 @@ class maf(FileTypeFormat): _process_kwargs = [] _allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"] - _allowed_alleles = ["A", "T", "C", "G", "N", " ", "-"] + _allowed_comb_alleles = ["A", "T", "C", "G", "N"] + _allowed_ind_alleles = ['-'] def _validateFilename(self, filePath): """ @@ -302,13 +303,16 @@ def _validate(self, mutationDF): invalid_indices = validate.get_invalid_allele_rows( mutationDF, allele_col, - allowed_alleles=self._allowed_alleles, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, ignore_case=True, + allow_na=False ) errors, warnings = validate.get_allele_validation_message( invalid_indices, invalid_col=allele_col, - allowed_alleles=self._allowed_alleles, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, fileformat=self._fileType, ) total_error.write(errors) diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index 61b53432..fe430b84 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -19,7 +19,8 @@ class vcf(FileTypeFormat): _process_kwargs = [] _allele_col = "REF" - _allowed_alleles = ["A", "T", "C", "G", "N"] + _allowed_comb_alleles = ["A", "T", "C", "G", "N"] + _allowed_ind_alleles = [] def _validateFilename(self, filePath): basename = os.path.basename(filePath[0]) @@ -142,14 +143,17 @@ def _validate(self, vcfdf): if process_functions.checkColExist(vcfdf, self._allele_col): invalid_indices = validate.get_invalid_allele_rows( vcfdf, - self._allele_col, - allowed_alleles=self._allowed_alleles, + input_col=self._allele_col, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, ignore_case=True, - ) + allow_na=False + ) errors, warnings = validate.get_allele_validation_message( invalid_indices, invalid_col=self._allele_col, - allowed_alleles=self._allowed_alleles, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, fileformat=self._fileType, ) total_error += errors diff --git a/tests/test_maf.py b/tests/test_maf.py index 9d9db949..541e6be8 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -95,7 +95,9 @@ def test_firstcolumn_validation(maf_class): "one of these: CHROMOSOME, HUGO_SYMBOL, " "TUMOR_SAMPLE_BARCODE.\n" "maf: Your REFERENCE_ALLELE column has invalid allele values. " - "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" + "This is the list of accepted allele values that can appear individually " + f"or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ) assert error == expectedErrors assert warning == "" @@ -150,9 +152,13 @@ def test_errors_validation(maf_class): "10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n" "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" "maf: Your REFERENCE_ALLELE column has invalid allele values. " - "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. " - "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ) expectedWarnings = ( "maf: " @@ -202,7 +208,9 @@ def test_invalid_validation(maf_class): "TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n" "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. " - "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ) expectedWarnings = ( "maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, " diff --git a/tests/test_validate.py b/tests/test_validate.py index bd7ea2af..fa52bb4a 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -770,92 +770,169 @@ def test_that_standardize_string_for_validation_returns_expected( assert test_str == expected -@pytest.mark.parametrize( - "input,expected_index,allowed_alleles,ignore_case", - [ - ( - pd.DataFrame( - {"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]} +def get_invalid_allele_rows_test_cases(): + return [ + { + "name": "correct_alleles", + "input": pd.DataFrame( + { + "REFERENCE_ALLELE": [ + "NANANANA", + "ACGTN", + "A", + "C", + "T", + "G", + "-", + "N", + ] + } ), - pd.Index([]), - ["A", "T", "C", "G", " ", "-"], - True, - ), - ( - pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", " "]}), - pd.Index([]), - ["A", "T", "C", "G", " ", "-"], - True, - ), - ( - pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}), - pd.Index([0, 1]), - ["A", "T", "C", "G", " ", "-"], - True, - ), - ( - pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}), - pd.Index([0]), - ["A", "T", "C", "G", " ", "-"], - True, - ), - ( - pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), - pd.Index([1, 2]), - ["A", "T", "C", "G", " ", "-"], - True, - ), - ( - pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}), - pd.Index([0]), - ["A", "T", "C", "G", " ", "-"], - False, - ), - ], - ids=[ - "correct_alleles", - "correct_alleles_case", - "invalid_special_chars", - "invalid_chars", - "missing_entries", - "case_not_ignored", - ], + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "N"], + "allowed_ind_alleles": ["-"], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "incorrect_alleles", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX", "XXX"]}), + "expected_index": pd.Index([0, 1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "case_ignored", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["acgtg", "acgt", "-", "a"]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": ["-"], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "case_not_ignored", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}), + "expected_index": pd.Index([0]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": False, + "allow_na": True, + }, + { + "name": "no_ind_alleles_incorrect", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACG-T", "ACGT", "G-CT"]}), + "expected_index": pd.Index([0, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "no_ind_alleles_correct", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACT", "ACGT", "G"]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "missing_entries_not_allowed", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), + "expected_index": pd.Index([1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, + { + "name": "missing_entries_allowed", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "no_specified_alleles_values", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", "ACGF", "B"]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": [], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get_invalid_allele_rows_test_cases(), ids=lambda x: x["name"] ) -def test_that_get_invalid_allele_rows_returns_expected( - input, expected_index, allowed_alleles, ignore_case -): +def test_that_get_invalid_allele_rows_returns_expected(test_cases): invalid_rows = validate.get_invalid_allele_rows( - input, + test_cases["input"], input_col="REFERENCE_ALLELE", - allowed_alleles=allowed_alleles, - ignore_case=ignore_case, + allowed_comb_alleles=test_cases["allowed_comb_alleles"], + allowed_ind_alleles=test_cases["allowed_ind_alleles"], + ignore_case=test_cases["ignore_case"], + allow_na=test_cases["allow_na"], ) - assert invalid_rows.equals(expected_index) + assert invalid_rows.equals(test_cases["expected_index"]) -@pytest.mark.parametrize( - "input_invalid_rows,expected_error,expected_warning", - [ - ( - pd.Index([1, 2, 3]), - ( +def get_allele_validation_message_test_cases(): + return [ + { + "name": "has_invalid_alleles", + "input_invalid_rows": pd.Index([1, 2, 3]), + "allowed_comb_alleles": ["A", "C", "T", "G"], + "allowed_ind_alleles": ["-"], + "expected_error": ( "maf: Your REFERENCE_ALLELE column has invalid allele values. " - "These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,C,T,G.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ), - "", - ), - ([], "", ""), - ], - ids=["has_invalid_alleles", "has_no_invalid_alleles"], + "expected_warning": "", + }, + { + "name": "has_no_invalid_alleles", + "input_invalid_rows": [], + "allowed_comb_alleles": [], + "allowed_ind_alleles": [], + "expected_error": "", + "expected_warning": "", + }, + { + "name": "has_invalid_alleles_empty_ind_alleles", + "input_invalid_rows": pd.Index([1, 2, 3]), + "allowed_comb_alleles": ["A", "C", "T", "G"], + "allowed_ind_alleles": [], + "expected_error": ( + "maf: Your REFERENCE_ALLELE column has invalid allele values. " + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,C,T,G.\n" + "This is the list of accepted allele values that can only appear individually: \n" + ), + "expected_warning": "", + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get_allele_validation_message_test_cases(), ids=lambda x: x["name"] ) -def test_that_get_allele_validation_message_returns_expected( - input_invalid_rows, expected_error, expected_warning -): +def test_that_get_allele_validation_message_returns_expected(test_cases): error, warning = validate.get_allele_validation_message( - input_invalid_rows, + invalid_indices=test_cases["input_invalid_rows"], invalid_col="REFERENCE_ALLELE", - allowed_alleles=["A", "C", "T", "G", " ", "-"], + allowed_comb_alleles=test_cases["allowed_comb_alleles"], + allowed_ind_alleles=test_cases["allowed_ind_alleles"], fileformat="maf", ) - assert error == expected_error - assert warning == expected_warning + assert error == test_cases["expected_error"] + assert warning == test_cases["expected_warning"] diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 6cff29d0..d8bbabf5 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -232,7 +232,9 @@ def test_validation_invalid_content(vcf_class): "vcf: Please double check your #CHROM column. This column must only be these values: " "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n" "vcf: Your REF column has invalid allele values. " - "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N'].\n" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: \n" ) expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n" assert error == expectedError From 4fea6951abc9b91009851aee9df2874da433cf99 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:39:03 -0800 Subject: [PATCH 5/8] remove NA check in _check_allele_col --- genie_registry/maf.py | 8 -------- tests/test_maf.py | 22 +--------------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index c53e2529..62d7bdd1 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -47,14 +47,6 @@ def _check_allele_col(df, col): error = "" warning = "" if col_exist: - # CHECK: The value "NA" can't be used as a placeholder - if sum(df[col].fillna("") == "NA") > 0: - warning = ( - "maf: " - f"{col} column contains 'NA' values, " - "which cannot be placeholders for blank values. " - "Please put in empty strings for blank values.\n" - ) # CHECK: There can't be any null values if sum(df[col].isnull()) > 0: error = f"maf: {col} can't have any blank or null values.\n" diff --git a/tests/test_maf.py b/tests/test_maf.py index 541e6be8..309819e1 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -165,10 +165,6 @@ def test_errors_validation(maf_class): "Does not have the column headers that can give " "extra information to the processed maf: " "T_REF_COUNT, N_DEPTH.\n" - "maf: " - "REFERENCE_ALLELE column contains 'NA' values, " - "which cannot be placeholders for blank values. " - "Please put in empty strings for blank values.\n" ) assert error == expectedErrors @@ -213,9 +209,6 @@ def test_invalid_validation(maf_class): "This is the list of accepted allele values that can only appear individually: -\n" ) expectedWarnings = ( - "maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, " - "which cannot be placeholders for blank values. " - "Please put in empty strings for blank values.\n" "maf: Does not have the column headers that can give " "extra information to the processed maf: T_REF_COUNT.\n" ) @@ -226,25 +219,12 @@ def test_invalid_validation(maf_class): @pytest.mark.parametrize("col", ["temp", "REFERENCE_ALLELE"]) def test_noerror__check_allele_col(col): """Test error and warning is an empty string if REF col isn't passed in""" - df = pd.DataFrame(dict(REFERENCE_ALLELE=["A", "A"])) + df = pd.DataFrame(dict(REFERENCE_ALLELE=["NA", "A"])) error, warning = genie_registry.maf._check_allele_col(df, col) assert error == "" assert warning == "" -def test_warning__check_allele_col(): - """Test warning occurs when 'NA' string is passed in""" - df = pd.DataFrame(dict(TEMP=["NA", "A"])) - error, warning = genie_registry.maf._check_allele_col(df, "TEMP") - assert error == "" - assert warning == ( - "maf: " - "TEMP column contains 'NA' values, " - "which cannot be placeholders for blank values. " - "Please put in empty strings for blank values.\n" - ) - - def test_error__check_allele_col(): """Test error occurs when blank allele is passed in""" df = pd.DataFrame(dict(TEMP=[float("nan"), "A"])) From 780ee66b8f303534cbba17e2a649d6351b0ef2fa Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 7 Nov 2023 14:38:13 -0800 Subject: [PATCH 6/8] linting, standardizing --- genie_registry/maf.py | 4 ++-- genie_registry/vcf.py | 37 +++++++++++++++++++------------------ tests/test_maf.py | 8 +++++++- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 62d7bdd1..3adcc2ce 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -64,7 +64,7 @@ class maf(FileTypeFormat): _process_kwargs = [] _allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"] _allowed_comb_alleles = ["A", "T", "C", "G", "N"] - _allowed_ind_alleles = ['-'] + _allowed_ind_alleles = ["-"] def _validateFilename(self, filePath): """ @@ -298,7 +298,7 @@ def _validate(self, mutationDF): allowed_comb_alleles=self._allowed_comb_alleles, allowed_ind_alleles=self._allowed_ind_alleles, ignore_case=True, - allow_na=False + allow_na=False, ) errors, warnings = validate.get_allele_validation_message( invalid_indices, diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index fe430b84..cf381086 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -18,7 +18,7 @@ class vcf(FileTypeFormat): _fileType = "vcf" _process_kwargs = [] - _allele_col = "REF" + _allele_cols = ["REF"] _allowed_comb_alleles = ["A", "T", "C", "G", "N"] _allowed_ind_alleles = [] @@ -140,24 +140,25 @@ def _validate(self, vcfdf): total_error += error warning += warn - if process_functions.checkColExist(vcfdf, self._allele_col): - invalid_indices = validate.get_invalid_allele_rows( - vcfdf, - input_col=self._allele_col, - allowed_comb_alleles=self._allowed_comb_alleles, - allowed_ind_alleles=self._allowed_ind_alleles, - ignore_case=True, - allow_na=False + for allele_col in self._allele_cols: + if process_functions.checkColExist(vcfdf, allele_col): + invalid_indices = validate.get_invalid_allele_rows( + vcfdf, + input_col=allele_col, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, + ignore_case=True, + allow_na=False, ) - errors, warnings = validate.get_allele_validation_message( - invalid_indices, - invalid_col=self._allele_col, - allowed_comb_alleles=self._allowed_comb_alleles, - allowed_ind_alleles=self._allowed_ind_alleles, - fileformat=self._fileType, - ) - total_error += errors - warning += warnings + errors, warnings = validate.get_allele_validation_message( + invalid_indices, + invalid_col=allele_col, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, + fileformat=self._fileType, + ) + total_error += errors + warning += warnings # No white spaces white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1) diff --git a/tests/test_maf.py b/tests/test_maf.py index 309819e1..71d61e64 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -19,7 +19,13 @@ def valid_maf_df(): dict( CHROMOSOME=[1, 2, 3, 4, 5], START_POSITION=[1, 2, 3, 4, 2], - REFERENCE_ALLELE=["A", "A", "A", "A", "A"], + REFERENCE_ALLELE=[ + "C", + "G", + "NA", + "-", + "TAAAGATCGTACAGAA", + ], TUMOR_SAMPLE_BARCODE=[ "GENIE-SAGE-ID1-1", "GENIE-SAGE-ID1-1", From da74efcf48bc8e12473c42063b0490ca681958b0 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 7 Nov 2023 17:18:53 -0800 Subject: [PATCH 7/8] add allele validation special handling for all nas and non-str cols --- genie/validate.py | 24 +++++++++++++++++------ tests/test_validate.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/genie/validate.py b/genie/validate.py index 733749b5..049cde88 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -424,7 +424,7 @@ def get_invalid_allele_rows( allowed_comb_alleles: list, allowed_ind_alleles: list, ignore_case: bool = False, - allow_na: bool = False, + allow_na: bool = False ) -> pd.Index: """ Find invalid indices in a DataFrame column based on allowed allele values. @@ -439,7 +439,6 @@ def get_invalid_allele_rows( ignore_case (bool, optional): whether to perform case-insensitive matching allow_na (bool, optional): whether to allow NAs to be an allowed allele value or not. - Returns: pd.Index: A pandas index object indicating the row indices that don't match the allowed alleles @@ -456,10 +455,23 @@ def get_invalid_allele_rows( else: flags = 0 # no flags - matching_indices = input_data[input_col].str.match( - search_str, flags=flags, na=allow_na - ) - invalid_indices = input_data[~matching_indices].index + # special handling for all NA column + is_all_na = pd.isna(input_data[input_col]).all() + if is_all_na and allow_na: + invalid_indices = pd.Index([]) + elif is_all_na and not allow_na: + invalid_indices = input_data.index + else: + # convert numeric cols to string while preserving NAs in order to use str.match + transformed_data = input_data.copy() + transformed_data[input_col] = transform._convert_col_with_nas_to_str( + transformed_data, input_col + ) + + matching_indices = transformed_data[input_col].str.match( + search_str, flags=flags, na=allow_na + ) + invalid_indices = transformed_data[~matching_indices].index return invalid_indices diff --git a/tests/test_validate.py b/tests/test_validate.py index fa52bb4a..4e16cfee 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -866,6 +866,50 @@ def get_invalid_allele_rows_test_cases(): "ignore_case": True, "allow_na": True, }, + { + "name": "float_nas_not_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]} + ), + "expected_index": pd.Index([0, 1, 2, 3, 4]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, + { + "name": "float_nas_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]} + ), + "expected_index": pd.Index([0, 1, 3, 4]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "all_missing_nas_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]} + ), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "all_missing_nas_not_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]} + ), + "expected_index": pd.Index([0, 1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, ] From 4d3222db4141ea17fae816cc4351641d736049fd Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 7 Nov 2023 17:36:33 -0800 Subject: [PATCH 8/8] lint --- genie/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genie/validate.py b/genie/validate.py index 049cde88..5bf0f17a 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -424,7 +424,7 @@ def get_invalid_allele_rows( allowed_comb_alleles: list, allowed_ind_alleles: list, ignore_case: bool = False, - allow_na: bool = False + allow_na: bool = False, ) -> pd.Index: """ Find invalid indices in a DataFrame column based on allowed allele values.