
Commit

introduce row specific validation
danlu1 committed Nov 13, 2024
1 parent a6928d2 commit d95d684
Showing 3 changed files with 318 additions and 20 deletions.
90 changes: 90 additions & 0 deletions genie/process_functions.py
@@ -980,3 +980,93 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
        elif data_type == "boolean":
            dataset[column] = dataset[column].astype(pd.BooleanDtype())
    return dataset[list(schema.keys())]


def get_row_indices_for_invalid_column_values(
    df: pd.DataFrame,
    col: str,
    possible_values: list,
    na_allowed: bool = False,
    sep: Optional[str] = None,
) -> pd.Index:
    """Checks the values of a column against possible_values and returns the row indices of invalid rows.
    Currently, this function is only used in assay.py

    Args:
        df (pd.DataFrame): Input dataframe
        col (str): The column to be checked
        possible_values (list): The list of possible values
        na_allowed (bool, optional): If NA is allowed. Defaults to False.
        sep (Optional[str], optional): The string separator. Defaults to None.

    Returns:
        pd.Index: The row indices of the rows with values that are not in possible_values.
    """
    if na_allowed:
        # dropping NAs only applies to columns of individual values, not columns of value lists
        check_values = df[col].dropna()
    else:
        check_values = df[col]
    if sep:
        # for columns that contain lists of values
        check_values = check_values.apply(
            lambda x: all(substring in possible_values for substring in x.split(sep))
        )
    else:
        check_values = check_values.apply(lambda x: x in possible_values)
    return check_values[~check_values].index
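# Editor's illustration (not part of this commit), assuming a ";"-separated list
# column; the separator used by the GENIE files is not shown in this diff. With
# sep set, every substring of a value must appear in possible_values:
# >>> df = pd.DataFrame({"coverage": ["hotspot_regions;coding_exons", "hotspot_regions;exome"]})
# >>> get_row_indices_for_invalid_column_values(
# ...     df, "coverage", ["hotspot_regions", "coding_exons", "introns", "promoters"], sep=";"
# ... )
# returns an index containing row 1 only, because "exome" is not a valid value.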


def get_message_for_invalid_column_value(
    col: str, filename: str, invalid_indices: pd.Index, possible_values: list
) -> tuple:
    """Returns the error and warning messages if the target column has rows with invalid values.
    Currently, this function is only used in assay.py

    Args:
        col (str): The column to be checked
        filename (str): The file name
        invalid_indices (pd.Index): The row indices of the rows with invalid values
        possible_values (list): The list of possible values

    Returns:
        tuple: warning, error
    """
    warning = ""
    error = ""
    # Concatenate the possible values for display, stripping the ".0" suffix added by
    # pandas typing: an integer column with one NA/blank value is cast as a double.
    possible_values = ", ".join([str(value).replace(".0", "") for value in possible_values])
    if len(invalid_indices) > 0:
        error = (
            f"{filename}: Please double check your {col} column. Valid values are {possible_values}. "
            f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
            f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n"
        )
    return (warning, error)
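# Editor's illustration (not part of this commit) of the message formatting: for a
# hypothetical integer column that pandas has read as floats, possible_values of
# [1.0, 5.0] is rendered as "1, 5", and the error lists the offending rows, e.g.
# "example.txt: Please double check your some_col column. Valid values are 1, 5.
#  You have 2 row(s) in your file where some_col column contains invalid values.
#  The row(s) this occurs in are: [2, 5]. Please correct."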


def check_column_and_values_row_specific(
    df: pd.DataFrame,
    col: str,
    possible_values: list,
    filename: str,
    na_allowed: bool = False,
    required: bool = False,
    sep: Optional[str] = None,
) -> tuple:
    """Checks if the column exists and whether the values in the column are valid.
    Currently, this function is only used in assay.py

    Args:
        df (pd.DataFrame): Input dataframe
        col (str): The column to be checked
        possible_values (list): The list of possible values
        filename (str): The file name
        na_allowed (bool, optional): If NA is allowed. Defaults to False.
        required (bool, optional): If the column is required. Defaults to False.
        sep (Optional[str], optional): The string separator. Defaults to None.

    Returns:
        tuple: warning, error
    """
    warning = ""
    error = ""
    # check the existence of the column
    have_column = checkColExist(df, col)
    if not have_column:
        if required:
            error = "{filename}: Must have {col} column.\n".format(
                filename=filename, col=col
            )
        else:
            warning = (
                "{filename}: Doesn't have {col} column. "
                "This column will be added.\n".format(filename=filename, col=col)
            )
    else:
        # get the row indices of the invalid values
        invalid_indices = get_row_indices_for_invalid_column_values(
            df, col, possible_values, na_allowed, sep
        )
        # generate the validation messages
        warning, error = get_message_for_invalid_column_value(
            col, filename, invalid_indices, possible_values
        )

    return (warning, error)
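A minimal sketch (not part of this commit) of how the new helper might be called, mirroring the library_strategy check in assay.py below; the dataframe contents and the filename are made up for illustration.

import pandas as pd

from genie import process_functions

# Made-up assay data: the third row's library_strategy is not a valid value.
assay_info_df = pd.DataFrame(
    {"library_strategy": ["WXS", "Targeted Sequencing", "WGS"]}
)
warning, error = process_functions.check_column_and_values_row_specific(
    assay_info_df,
    "library_strategy",
    ["Targeted Sequencing", "WXS"],
    filename="assay_information.yaml",  # hypothetical filename
    required=True,
)
print(warning)  # "" because the column exists
print(error)
# assay_information.yaml: Please double check your library_strategy column.
# Valid values are Targeted Sequencing, WXS. You have 1 row(s) in your file
# where library_strategy column contains invalid values. The row(s) this
# occurs in are: [2]. Please correct.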
22 changes: 11 additions & 11 deletions genie_registry/assay.py
@@ -178,7 +178,7 @@ def _validate(self, assay_info_df):

        read_group_dict = process_functions.get_gdc_data_dictionary("read_group")
        read_group_headers = read_group_dict["properties"]
-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "is_paired_end",
            [True, False],
@@ -188,7 +188,7 @@ def _validate(self, assay_info_df):
        warning += warn
        total_error += error

-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "library_selection",
            read_group_headers["library_selection"]["enum"],
@@ -198,7 +198,7 @@ def _validate(self, assay_info_df):
        warning += warn
        total_error += error

-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "library_strategy",
            ["Targeted Sequencing", "WXS"],
@@ -208,7 +208,7 @@ def _validate(self, assay_info_df):
        warning += warn
        total_error += error

-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "platform",
            read_group_headers["platform"]["enum"],
@@ -220,7 +220,7 @@ def _validate(self, assay_info_df):

        instrument_model = read_group_headers["instrument_model"]["enum"]
        instrument_model.extend(["Illumina NovaSeq 6000", None])
-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "instrument_model",
            instrument_model,
@@ -231,7 +231,7 @@ def _validate(self, assay_info_df):
        total_error += error

        # target_capture_kit = read_group_headers['target_capture_kit']['enum']
-        # warn, error = process_functions.check_col_and_values(
+        # warn, error = process_functions.check_column_and_values_row_specific(
        #     assay_info_df,
        #     'target_capture_kit',
        #     target_capture_kit,
@@ -266,7 +266,7 @@ def _validate(self, assay_info_df):
            "3'Flank",
            None,
        ]
-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "variant_classifications",
            variant_classes,
@@ -329,7 +329,7 @@ def _validate(self, assay_info_df):
            "gene_padding is by default 10 if not specified.\n"
        )

-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "calling_strategy",
            ["tumor_only", "tumor_normal", "plasma_normal"],
@@ -364,7 +364,7 @@ def _validate(self, assay_info_df):
            "intragenic_cna",
            "structural_variants",
        ]
-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "alteration_types",
            alteration_types,
@@ -376,7 +376,7 @@ def _validate(self, assay_info_df):
        total_error += error

        preservation_technique = ["FFPE", "fresh_frozen", "NA"]
-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "preservation_technique",
            preservation_technique,
@@ -388,7 +388,7 @@ def _validate(self, assay_info_df):
        total_error += error

        coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
-        warn, error = process_functions.check_col_and_values(
+        warn, error = process_functions.check_column_and_values_row_specific(
            assay_info_df,
            "coverage",
            coverage,
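For the call sites above, the practical change is that validation errors now point at the offending rows. A hedged sketch of what the is_paired_end check might now report; this is not the actual GENIE invocation, and the dataframe and filename below are invented for illustration.

import pandas as pd

from genie import process_functions

assay_info_df = pd.DataFrame({"is_paired_end": [True, "yes", False]})
warn, error = process_functions.check_column_and_values_row_specific(
    assay_info_df,
    "is_paired_end",
    [True, False],
    filename="assay_information.yaml",  # hypothetical filename
    required=True,
)
# error names the row: "...You have 1 row(s) in your file where is_paired_end
# column contains invalid values. The row(s) this occurs in are: [1].
# Please correct."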
