def get_row_indices_for_invalid_column_values(
    df: pd.DataFrame,
    col: str,
    possible_values: list,
    na_allowed: bool = False,
    sep: Optional[str] = None,
) -> pd.Index:
    """Check a column's values against a list of allowed values and return
    the row indices of the rows holding invalid values.

    Currently, this function is only used in assay.py.

    Args:
        df (pd.DataFrame): Input dataframe
        col (str): The column to be checked
        possible_values (list): The list of possible values
        na_allowed (bool, optional): If NA is allowed. Defaults to False.
        sep (Optional[str], optional): The string separator for columns whose
            cells contain multiple delimited values. Defaults to None.

    Returns:
        pd.Index: The row indices of the rows with values that are not in
            possible_values.
    """
    if na_allowed:
        # Dropping NAs is only useful for individual values rather than
        # separator-delimited value lists (NaN has no .split).
        check_values = df[col].dropna()
    else:
        check_values = df[col]
    if sep:
        # For columns containing delimited lists of values: every element of
        # the split cell must be an allowed value.
        is_valid = check_values.apply(
            lambda x: all(substring in possible_values for substring in x.split(sep))
        )
    else:
        is_valid = check_values.apply(lambda x: x in possible_values)
    # Invert the validity mask to keep only the failing rows' indices.
    return is_valid[~is_valid].index


def get_message_for_invalid_column_value(
    col: str, filename: str, invalid_indices: pd.Index, possible_values: list
) -> tuple:
    """Build the warning and error messages for a column that has rows with
    invalid values.

    Currently, this function is only used in assay.py.

    Args:
        col (str): The column to be checked
        filename (str): The file name, used as the message prefix
        invalid_indices (pd.Index): The row indices of the rows with invalid values
        possible_values (list): The list of possible values

    Returns:
        tuple: (warning, error); both empty strings when there are no invalid rows.
    """
    warning = ""
    error = ""
    # Concatenate possible values for display. The ".0" suffix is stripped
    # because of pandas typing: an integer column with one NA/blank value is
    # cast as a double, so integers would otherwise render as e.g. "1.0".
    possible_values_str = ", ".join(
        [str(value).replace(".0", "") for value in possible_values]
    )
    if len(invalid_indices) > 0:
        # BUG FIX: the message previously hard-coded "(unknown)" instead of
        # interpolating the filename passed in by the caller; the tests for
        # this function expect the filename prefix.
        error = (
            f"{filename}: Please double check your {col} column. "
            f"Valid values are {possible_values_str}. "
            f"You have {len(invalid_indices)} row(s) in your file "
            f"where {col} column contains invalid values. "
            f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n"
        )
    return (warning, error)


def check_column_and_values_row_specific(
    df: pd.DataFrame,
    col: str,
    possible_values: list,
    filename: str,
    na_allowed: bool = False,
    required: bool = False,
    sep: Optional[str] = None,
) -> tuple:
    """Check that a column exists and that its values are valid, reporting the
    specific row indices that fail validation.

    Currently, this function is only used in assay.py.

    Args:
        df (pd.DataFrame): Input dataframe
        col (str): The column to be checked
        possible_values (list): The list of possible values
        filename (str): The file name, used as the message prefix
        na_allowed (bool, optional): If NA is allowed. Defaults to False.
        required (bool, optional): If the column is required. Defaults to False.
        sep (Optional[str], optional): The string separator. Defaults to None.

    Returns:
        tuple: (warning, error)
    """
    warning = ""
    error = ""
    # Check the existence of the column first.
    have_column = checkColExist(df, col)
    if not have_column:
        # BUG FIX: both messages previously hard-coded "(unknown)" even though
        # filename was passed to .format(); interpolate the real filename.
        if required:
            error = "{filename}: Must have {col} column.\n".format(
                filename=filename, col=col
            )
        else:
            warning = (
                "{filename}: Doesn't have {col} column. "
                "This column will be added.\n".format(filename=filename, col=col)
            )
    else:
        # Get the row indices of invalid values, then build the messages.
        invalid_indices = get_row_indices_for_invalid_column_values(
            df, col, possible_values, na_allowed, sep
        )
        warning, error = get_message_for_invalid_column_value(
            col, filename, invalid_indices, possible_values
        )

    return (warning, error)
warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, "platform", read_group_headers["platform"]["enum"], @@ -220,7 +220,7 @@ def _validate(self, assay_info_df): instrument_model = read_group_headers["instrument_model"]["enum"] instrument_model.extend(["Illumina NovaSeq 6000", None]) - warn, error = process_functions.check_col_and_values( + warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, "instrument_model", instrument_model, @@ -231,7 +231,7 @@ def _validate(self, assay_info_df): total_error += error # target_capture_kit = read_group_headers['target_capture_kit']['enum'] - # warn, error = process_functions.check_col_and_values( + # warn, error = process_functions.check_column_and_values_row_specific( # assay_info_df, # 'target_capture_kit', # target_capture_kit, @@ -266,7 +266,7 @@ def _validate(self, assay_info_df): "3'Flank", None, ] - warn, error = process_functions.check_col_and_values( + warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, "variant_classifications", variant_classes, @@ -329,7 +329,7 @@ def _validate(self, assay_info_df): "gene_padding is by default 10 if not specified.\n" ) - warn, error = process_functions.check_col_and_values( + warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, "calling_strategy", ["tumor_only", "tumor_normal", "plasma_normal"], @@ -364,7 +364,7 @@ def _validate(self, assay_info_df): "intragenic_cna", "structural_variants", ] - warn, error = process_functions.check_col_and_values( + warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, "alteration_types", alteration_types, @@ -376,7 +376,7 @@ def _validate(self, assay_info_df): total_error += error preservation_technique = ["FFPE", "fresh_frozen", "NA"] - warn, error = process_functions.check_col_and_values( + warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, 
"preservation_technique", preservation_technique, @@ -388,7 +388,7 @@ def _validate(self, assay_info_df): total_error += error coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"] - warn, error = process_functions.check_col_and_values( + warn, error = process_functions.check_column_and_values_row_specific( assay_info_df, "coverage", coverage, diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py index 72f72663..89b8fdb5 100644 --- a/tests/test_process_functions.py +++ b/tests/test_process_functions.py @@ -1,18 +1,13 @@ -from unittest.mock import Mock, patch import uuid +from unittest.mock import Mock, patch import pandas as pd -from pandas.api.types import ( - is_bool_dtype, - is_float_dtype, - is_integer_dtype, - is_string_dtype, -) -from pandas.testing import assert_frame_equal import pytest import synapseclient - from genie import process_functions +from pandas.api.types import (is_bool_dtype, is_float_dtype, is_integer_dtype, + is_string_dtype) +from pandas.testing import assert_frame_equal DATABASE_DF = pd.DataFrame( { @@ -715,3 +710,216 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df() assert result.isna().sum().sum() == 11 assert_frame_equal(result, expected_output, check_exact=True) + + +def get_row_indices_for_invalid_column_values_test_cases(): + return [ + { + "name": "has_na_and_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": True, + "sep": None, + "expected_index": pd.Index([1]), + }, + { + "name": "has_na_but_not_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": False, + "sep": None, + "expected_index": pd.Index([1, 2, 3]), + }, + { + "name": "invalid_values_na_allowed", + "df": pd.DataFrame({"test_col": ["val1", "VAL1", float("nan"), None]}), + "col": "test_col", + 
"possible_values": ["Val1"], + "na_allowed": True, + "sep": None, + "expected_index": pd.Index([0, 1]), + }, + { + "name": "invalid_values_na_not_allowed", + "df": pd.DataFrame({"test_col": ["val1", "VAL1", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": False, + "sep": None, + "expected_index": pd.Index([0, 1, 2, 3]), + }, + { + "name": "values_in_list", + "df": pd.DataFrame({"test_col": ["Val1;Val2", "Val1;Val2;Val3","Val1", "Val1;", "Val1;None"]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "na_allowed": True, + "sep": ";", + "expected_index": pd.Index([1, 3, 4]), + }, + { + "name": "valid_data", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "na_allowed": False, + "sep": ";", + "expected_index": pd.Index([]), + }, + ] +@pytest.mark.parametrize( + "test_cases", get_row_indices_for_invalid_column_values_test_cases(), ids=lambda x: x["name"] +) +def test_get_row_indices_for_invalid_column_values(test_cases): + df = test_cases["df"] + col = test_cases["col"] + possible_values = test_cases["possible_values"] + na_allowed = test_cases["na_allowed"] + sep = test_cases["sep"] + results = process_functions.get_row_indices_for_invalid_column_values(df, col, possible_values, na_allowed, sep) + assert results.equals(test_cases["expected_index"]) + +def get_message_for_invalid_column_value_test_cases(): + return [ + { + "name": "invalid_data", + "col": "test_col", + "filename": "test_filename", + "invalid_indices": pd.Index([1, 2, 3]), + "possible_values": ["Val1"], + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\ + "You have 3 row(s) in your file where test_col column contains invalid values. "\ + "The row(s) this occurs in are: [1, 2, 3]. 
Please correct.\n", + "expected_warning": "" + }, + { + "name": "valid_data", + "col": "test_col", + "filename": "test_filename", + "invalid_indices": pd.Index([]), + "possible_values": ["Val1", "Val2"], + "expected_error": "", + "expected_warning": "" + }, + ] +@pytest.mark.parametrize( + "test_cases", get_message_for_invalid_column_value_test_cases(), ids=lambda x: x["name"] +) +def test_get_message_for_invalid_column_value(test_cases): + col = test_cases["col"] + filename = test_cases["filename"] + invalid_indices = test_cases["invalid_indices"] + possible_values = test_cases["possible_values"] + warning, error = process_functions.get_message_for_invalid_column_value(col, filename, invalid_indices, possible_values) + assert warning == test_cases["expected_warning"] + assert error == test_cases["expected_error"] + +def check_col_and_values_row_specific_test_cases(): + return [ + { + "name": "valid_data_with_value_list", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "", + "expected_warning": "" + }, + { + "name": "valid_data_with_individual_value_na_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "", + "expected_warning": "" + }, + { + "name": "missing_required_column", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col1", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "test_filename: Must have test_col1 column.\n", + "expected_warning": "" + }, + { + "name": "missing_optional_column", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + 
"col": "test_col1", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": False, + "sep": ";", + "expected_error": "", + "expected_warning": "test_filename: Doesn't have test_col1 column. This column will be added.\n" + }, + { + "name": "invalid_data_with_value_list", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\ + "You have 2 row(s) in your file where test_col column contains invalid values. "\ + "The row(s) this occurs in are: [1, 2]. Please correct.\n", + "expected_warning": "" + }, + { + "name": "invalid_data_with_individual_value_na_not_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "filename": "test_filename", + "na_allowed": False, + "required": True, + "sep": None, + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1, Val2. "\ + "You have 3 row(s) in your file where test_col column contains invalid values. "\ + "The row(s) this occurs in are: [2, 3, 4]. Please correct.\n", + "expected_warning": "" + }, + { + "name": "invalid_data_with_individual_value_na_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": None, + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\ + "You have 2 row(s) in your file where test_col column contains invalid values. "\ + "The row(s) this occurs in are: [1, 2]. 
Please correct.\n", + "expected_warning": "" + }, + ] +@pytest.mark.parametrize( + "test_cases", check_col_and_values_row_specific_test_cases(), ids=lambda x: x["name"] +) +def test_check_col_and_values_row_specific(test_cases): + df = test_cases["df"] + col = test_cases["col"] + possible_values = test_cases["possible_values"] + filename = test_cases["filename"] + na_allowed = test_cases["na_allowed"] + required = test_cases["required"] + sep = test_cases["sep"] + warning, error = process_functions.check_column_and_values_row_specific(df, col, possible_values, filename, na_allowed, required, sep) + assert warning == test_cases["expected_warning"] + assert error == test_cases["expected_error"]