Skip to content

Commit

Permalink
[GEN-867] Add validation rule to check if INT_DOD >= INT_CONTACT (#561)
Browse files Browse the repository at this point in the history
* add validation rule to check if INT_DOD >= INT_CONTACT
  • Loading branch information
danlu1 authored Apr 28, 2024
1 parent 2985f37 commit 4c0f625
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 19 deletions.
67 changes: 63 additions & 4 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,7 @@ def _check_year_death_validity(clinicaldf: pd.DataFrame) -> pd.Index:
# Convert YEAR_DEATH and YEAR_CONTACT to numeric, coercing errors to NaN
temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce")
temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce")
# Compare rows with numeric values in both columns and return the comparison results ("True"/"False").
# If either of the columns contains NA or nominal data (e.g. "Unknown", "Not Collected", "Not Applicable"),
# "N/A" will be output.
# Compare rows with numeric values in both YEAR_DEATH and YEAR_CONTACT columns
temp["check_result"] = np.where(
(pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])),
"N/A",
Expand Down Expand Up @@ -223,7 +221,58 @@ def _check_year_death_validity_message(
"Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. "
"YEAR_DEATH must be >= YEAR_CONTACT. "
f"There are {len(invalid_year_death_indices)} row(s) with YEAR_DEATH < YEAR_CONTACT. "
f"Row {invalid_year_death_indices.tolist()} contain invalid values in the YEAR_DEATH field. Please correct.\n"
f"The row number(s) this occurs in are: {invalid_year_death_indices.tolist()}. Please correct.\n"
)
return error, warning


def _check_int_dod_validity(clinicaldf: pd.DataFrame) -> pd.Index:
"""
INT_DOD should alway be greater than or equal to INT_CONTACT when they are both available.
This function checks if INT_DOD >= INT_CONTACT and returns row indices of invalid INT_DOD rows.
Args:
clinicaldf: Clinical Data Frame
Returns:
pd.Index: The row indices of the row with INT_DOD < INT_CONTACT in the input clinical data
"""
# Generate temp dataframe to handle datatype mismatch in a column
temp = clinicaldf[["INT_DOD", "INT_CONTACT"]].copy()
# Convert INT_DOD and INT_CONTACT to numeric, coercing errors to NaN
temp["INT_DOD"] = pd.to_numeric(temp["INT_DOD"], errors="coerce")
temp["INT_CONTACT"] = pd.to_numeric(temp["INT_CONTACT"], errors="coerce")
# Compare rows with numeric values in both INT_DOD and INT_CONTACT columns
temp["check_result"] = np.where(
(pd.isna(temp["INT_DOD"]) | pd.isna(temp["INT_CONTACT"])),
"N/A",
temp["INT_DOD"] >= temp["INT_CONTACT"],
)
invalid_int_dod = temp[temp["check_result"] == "False"]
return invalid_int_dod.index


def _check_int_dod_validity_message(
invalid_int_dod_indices: pd.Index,
) -> Tuple[str, str]:
"""This function returns the error and warning messages
if the input clinical data has row with INT_DOD < INT_CONTACT
Args:
invalid_int_dod_indices: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data
Returns:
Tuple[str, str]: The error message that tells you how many patients with invalid INT_DOD values that your
input clinical data has
"""
error = ""
warning = ""
if len(invalid_int_dod_indices) > 0:
error = (
"Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
"INT_DOD must be >= INT_CONTACT. "
f"There are {len(invalid_int_dod_indices)} row(s) with INT_DOD < INT_CONTACT. "
f"The row number(s) this occurs in are: {invalid_int_dod_indices.tolist()}. Please correct.\n"
)
return error, warning

Expand Down Expand Up @@ -956,6 +1005,16 @@ def _validate(self, clinicaldf):
)
else:
total_error.write("Patient Clinical File: Must have DEAD column.\n")

# CHECK: INT DOD against INT CONTACT
has_int_dod_and_contact = process_functions.checkColExist(
clinicaldf, ["INT_DOD", "INT_CONTACT"]
)
if has_int_dod_and_contact:
invalid_int_dod_indices = _check_int_dod_validity(clinicaldf)
errors, warnings = _check_int_dod_validity_message(invalid_int_dod_indices)
total_error.write(errors)

# CHECK: contact vital status value consistency
contact_error = _check_int_year_consistency(
clinicaldf=clinicaldf,
Expand Down
128 changes: 113 additions & 15 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,19 +91,34 @@ def valid_clinical_df():
"GENIE-SAGE-ID3",
"GENIE-SAGE-ID4",
"GENIE-SAGE-ID5",
"GENIE-SAGE-ID6",
],
SEX=[1, 2, 1, 2, 99],
PRIMARY_RACE=[1, 2, 3, 4, 99],
SECONDARY_RACE=[1, 2, 3, 4, 99],
TERTIARY_RACE=[1, 2, 3, 4, 99],
ETHNICITY=[1, 2, 3, 4, 99],
BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990],
CENTER=["FOO", "FOO", "FOO", "FOO", "FOO"],
YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990],
INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000],
YEAR_DEATH=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<18"],
INT_DOD=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<6570"],
DEAD=["Unknown", "Not Collected", "Unknown", False, True],
SEX=[1, 2, 1, 2, 99, 99],
PRIMARY_RACE=[1, 2, 3, 4, 99, 99],
SECONDARY_RACE=[1, 2, 3, 4, 99, 99],
TERTIARY_RACE=[1, 2, 3, 4, 99, 99],
ETHNICITY=[1, 2, 3, 4, 99, 99],
BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990, 1990],
CENTER=["FOO", "FOO", "FOO", "FOO", "FOO", "FOO"],
YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990, 1990],
INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000, 2000],
YEAR_DEATH=[
"Unknown",
"Not Collected",
"Unknown",
"Not Applicable",
"<18",
"<18",
],
INT_DOD=[
"Unknown",
"Not Collected",
"Unknown",
"Not Applicable",
"<6570",
2001,
],
DEAD=["Unknown", "Not Collected", "Unknown", False, True, True],
)
)

Expand Down Expand Up @@ -632,7 +647,7 @@ def test_errors__validate(clin_class):
YEAR_DEATH=["Unknown", "Not Collected", "Not Applicable", 19930, 1990],
YEAR_CONTACT=["Unknown", "Not Collected", 1990, 1990, 19940],
INT_CONTACT=[">32485", "<6570", 1990, "Not Collected", ">foobar"],
INT_DOD=[">32485", "<6570", "Unknown", "Not Collected", "<dense"],
INT_DOD=[">32485", "<6570", 1911, "Not Collected", "<dense"],
DEAD=[1, False, "Unknown", "Not Collected", "Not Applicable"],
)
)
Expand Down Expand Up @@ -701,7 +716,7 @@ def test_errors__validate(clin_class):
"Patient Clinical File: Please double check your YEAR_DEATH "
"and YEAR_CONTACT columns. YEAR_DEATH must be >= YEAR_CONTACT. "
"There are 1 row(s) with YEAR_DEATH < YEAR_CONTACT. "
"Row [4] contain invalid values in the YEAR_DEATH field. Please correct.\n"
"The row number(s) this occurs in are: [4]. Please correct.\n"
"Patient Clinical File: Please double check your INT_CONTACT "
"column, it must be an integer, '>32485', '<6570', 'Unknown', "
"'Not Released' or 'Not Collected'.\n"
Expand All @@ -711,6 +726,10 @@ def test_errors__validate(clin_class):
"Patient Clinical File: Please double check your DEAD column, "
"it must be True, False, 'Unknown', "
"'Not Released' or 'Not Collected'.\n"
"Patient Clinical File: Please double check your INT_DOD "
"and INT_CONTACT columns. INT_DOD must be >= INT_CONTACT. "
"There are 1 row(s) with INT_DOD < INT_CONTACT. "
"The row number(s) this occurs in are: [2]. Please correct.\n"
"Patient: you have inconsistent redaction and text values in "
"YEAR_CONTACT, INT_CONTACT.\n"
"Patient: you have inconsistent redaction and text values in "
Expand Down Expand Up @@ -1129,7 +1148,7 @@ def test__check_year_death_validity(df, expected_indices):
"Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. "
"YEAR_DEATH must be >= YEAR_CONTACT. "
"There are 2 row(s) with YEAR_DEATH < YEAR_CONTACT. "
"Row [2, 3] contain invalid values in the YEAR_DEATH field. Please correct.\n",
"The row number(s) this occurs in are: [2, 3]. Please correct.\n",
),
],
ids=[
Expand All @@ -1145,6 +1164,85 @@ def test__check_year_death_validity_message(invalid_year_death_indices, expected
assert warning == ""


@pytest.mark.parametrize(
    "df,expected_indices",
    [
        pytest.param(
            pd.DataFrame({"INT_DOD": [420, 555, 390], "INT_CONTACT": [50, 40, 22]}),
            [],
            id="valid_dataframe_no_NAs",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "INT_DOD": [420, float("nan"), 390],
                    "INT_CONTACT": [50, 40, float("nan")],
                }
            ),
            [],
            id="valid_dataframe_w_NAs",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "INT_DOD": [float("nan"), float("nan"), 390],
                    "INT_CONTACT": [50, 40, float("nan")],
                }
            ),
            [],
            id="valid_dataframe_all_NAs",
        ),
        pytest.param(
            pd.DataFrame({"INT_DOD": [420, 666, 390], "INT_CONTACT": [50, 40, 555]}),
            [2],
            id="invalid_dataframe_no_NAs",
        ),
        pytest.param(
            pd.DataFrame(
                {"INT_DOD": [420, float("nan"), 390], "INT_CONTACT": [50, 40, 555]}
            ),
            [2],
            id="invalid_dataframe_w_NAs",
        ),
    ],
)
def test__check_int_dod_validity(df, expected_indices):
    """Rows with INT_DOD < INT_CONTACT are reported; rows with a NaN are not."""
    # Each case keeps its id next to its data so the two cannot drift apart
    flagged_rows = genie_registry.clinical._check_int_dod_validity(clinicaldf=df)
    assert flagged_rows.tolist() == expected_indices


@pytest.mark.parametrize(
    "invalid_int_dod_indices,expected_error",
    [
        pytest.param(pd.Index([]), "", id="valid_dataframe"),
        pytest.param(
            pd.Index([2, 3]),
            "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
            "INT_DOD must be >= INT_CONTACT. "
            "There are 2 row(s) with INT_DOD < INT_CONTACT. "
            "The row number(s) this occurs in are: [2, 3]. Please correct.\n",
            id="invalid_dataframe",
        ),
    ],
)
def test__check_int_dod_validity_message(invalid_int_dod_indices, expected_error):
    """The error text matches the expected message and the warning is empty."""
    messages = genie_registry.clinical._check_int_dod_validity_message(
        invalid_int_dod_indices
    )
    actual_error, actual_warning = messages
    assert actual_error == expected_error
    assert actual_warning == ""


def get_cross_validate_bed_files_test_cases():
return [
{
Expand Down

0 comments on commit 4c0f625

Please sign in to comment.