[GEN-867] Add validation rule to check if INT_DOD >= INT_CONTACT (#561)

* add validation rule to check if INT_DOD >= INT_CONTACT
Sage-Bionetworks · Apr 28, 2024 · 4c0f625 · 4c0f625
1 parent 2985f37
commit 4c0f625
Show file tree

Hide file tree

Showing 2 changed files with 176 additions and 19 deletions.
diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
@@ -191,9 +191,7 @@ def _check_year_death_validity(clinicaldf: pd.DataFrame) -> pd.Index:
     # Convert YEAR_DEATH and YEAR_CONTACT to numeric, coercing errors to NaN
     temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce")
     temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce")
-    # Compare rows with numeric values in both columns and returns comparion results("True"/"False")
-    # If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Unknown", "Not Applicable"),
-    # "N/A" will be outputed.
+    # Compare rows with numeric values in both YEAR_DEATH and YEAR_CONTACT columns
     temp["check_result"] = np.where(
         (pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])),
         "N/A",
@@ -223,7 +221,58 @@ def _check_year_death_validity_message(
             "Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. "
             "YEAR_DEATH must be >= YEAR_CONTACT. "
             f"There are {len(invalid_year_death_indices)} row(s) with YEAR_DEATH < YEAR_CONTACT. "
-            f"Row {invalid_year_death_indices.tolist()} contain invalid values in the YEAR_DEATH field. Please correct.\n"
+            f"The row number(s) this occurs in are: {invalid_year_death_indices.tolist()}. Please correct.\n"
+        )
+    return error, warning
+
+
+def _check_int_dod_validity(clinicaldf: pd.DataFrame) -> pd.Index:
+    """
+    INT_DOD should always be greater than or equal to INT_CONTACT when they are both available.
+    This function checks if INT_DOD >= INT_CONTACT and returns row indices of invalid INT_DOD rows.
+
+    Args:
+        clinicaldf: Clinical Data Frame
+
+    Returns:
+        pd.Index: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data
+    """
+    # Generate temp dataframe to handle datatype mismatch in a column
+    temp = clinicaldf[["INT_DOD", "INT_CONTACT"]].copy()
+    # Convert INT_DOD and INT_CONTACT to numeric, coercing errors to NaN
+    temp["INT_DOD"] = pd.to_numeric(temp["INT_DOD"], errors="coerce")
+    temp["INT_CONTACT"] = pd.to_numeric(temp["INT_CONTACT"], errors="coerce")
+    # Compare rows with numeric values in both INT_DOD and INT_CONTACT columns
+    temp["check_result"] = np.where(
+        (pd.isna(temp["INT_DOD"]) | pd.isna(temp["INT_CONTACT"])),
+        "N/A",
+        temp["INT_DOD"] >= temp["INT_CONTACT"],
+    )
+    invalid_int_dod = temp[temp["check_result"] == "False"]
+    return invalid_int_dod.index
+
+
+def _check_int_dod_validity_message(
+    invalid_int_dod_indices: pd.Index,
+) -> Tuple[str, str]:
+    """This function returns the error and warning messages
+    if the input clinical data has rows with INT_DOD < INT_CONTACT
+
+    Args:
+        invalid_int_dod_indices: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data
+
+    Returns:
+        Tuple[str, str]: The error message reporting how many rows of the input clinical data have
+        INT_DOD < INT_CONTACT (with their row indices), and the warning message (left as an empty string)
+    """
+    error = ""
+    warning = ""
+    if len(invalid_int_dod_indices) > 0:
+        error = (
+            "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
+            "INT_DOD must be >= INT_CONTACT. "
+            f"There are {len(invalid_int_dod_indices)} row(s) with INT_DOD < INT_CONTACT. "
+            f"The row number(s) this occurs in are: {invalid_int_dod_indices.tolist()}. Please correct.\n"
         )
     return error, warning
 
@@ -956,6 +1005,16 @@ def _validate(self, clinicaldf):
                 )
         else:
             total_error.write("Patient Clinical File: Must have DEAD column.\n")
+
+        # CHECK: INT DOD against INT CONTACT
+        has_int_dod_and_contact = process_functions.checkColExist(
+            clinicaldf, ["INT_DOD", "INT_CONTACT"]
+        )
+        if has_int_dod_and_contact:
+            invalid_int_dod_indices = _check_int_dod_validity(clinicaldf)
+            errors, warnings = _check_int_dod_validity_message(invalid_int_dod_indices)
+            total_error.write(errors)
+
         # CHECK: contact vital status value consistency
         contact_error = _check_int_year_consistency(
             clinicaldf=clinicaldf,

diff --git a/tests/test_clinical.py b/tests/test_clinical.py
@@ -91,19 +91,34 @@ def valid_clinical_df():
                 "GENIE-SAGE-ID3",
                 "GENIE-SAGE-ID4",
                 "GENIE-SAGE-ID5",
+                "GENIE-SAGE-ID6",
             ],
-            SEX=[1, 2, 1, 2, 99],
-            PRIMARY_RACE=[1, 2, 3, 4, 99],
-            SECONDARY_RACE=[1, 2, 3, 4, 99],
-            TERTIARY_RACE=[1, 2, 3, 4, 99],
-            ETHNICITY=[1, 2, 3, 4, 99],
-            BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990],
-            CENTER=["FOO", "FOO", "FOO", "FOO", "FOO"],
-            YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990],
-            INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000],
-            YEAR_DEATH=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<18"],
-            INT_DOD=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<6570"],
-            DEAD=["Unknown", "Not Collected", "Unknown", False, True],
+            SEX=[1, 2, 1, 2, 99, 99],
+            PRIMARY_RACE=[1, 2, 3, 4, 99, 99],
+            SECONDARY_RACE=[1, 2, 3, 4, 99, 99],
+            TERTIARY_RACE=[1, 2, 3, 4, 99, 99],
+            ETHNICITY=[1, 2, 3, 4, 99, 99],
+            BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990, 1990],
+            CENTER=["FOO", "FOO", "FOO", "FOO", "FOO", "FOO"],
+            YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990, 1990],
+            INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000, 2000],
+            YEAR_DEATH=[
+                "Unknown",
+                "Not Collected",
+                "Unknown",
+                "Not Applicable",
+                "<18",
+                "<18",
+            ],
+            INT_DOD=[
+                "Unknown",
+                "Not Collected",
+                "Unknown",
+                "Not Applicable",
+                "<6570",
+                2001,
+            ],
+            DEAD=["Unknown", "Not Collected", "Unknown", False, True, True],
         )
     )
 
@@ -632,7 +647,7 @@ def test_errors__validate(clin_class):
             YEAR_DEATH=["Unknown", "Not Collected", "Not Applicable", 19930, 1990],
             YEAR_CONTACT=["Unknown", "Not Collected", 1990, 1990, 19940],
             INT_CONTACT=[">32485", "<6570", 1990, "Not Collected", ">foobar"],
-            INT_DOD=[">32485", "<6570", "Unknown", "Not Collected", "<dense"],
+            INT_DOD=[">32485", "<6570", 1911, "Not Collected", "<dense"],
             DEAD=[1, False, "Unknown", "Not Collected", "Not Applicable"],
         )
     )
@@ -701,7 +716,7 @@ def test_errors__validate(clin_class):
             "Patient Clinical File: Please double check your YEAR_DEATH "
             "and YEAR_CONTACT columns. YEAR_DEATH must be >= YEAR_CONTACT. "
             "There are 1 row(s) with YEAR_DEATH < YEAR_CONTACT. "
-            "Row [4] contain invalid values in the YEAR_DEATH field. Please correct.\n"
+            "The row number(s) this occurs in are: [4]. Please correct.\n"
             "Patient Clinical File: Please double check your INT_CONTACT "
             "column, it must be an integer, '>32485', '<6570', 'Unknown', "
             "'Not Released' or 'Not Collected'.\n"
@@ -711,6 +726,10 @@ def test_errors__validate(clin_class):
             "Patient Clinical File: Please double check your DEAD column, "
             "it must be True, False, 'Unknown', "
             "'Not Released' or 'Not Collected'.\n"
+            "Patient Clinical File: Please double check your INT_DOD "
+            "and INT_CONTACT columns. INT_DOD must be >= INT_CONTACT. "
+            "There are 1 row(s) with INT_DOD < INT_CONTACT. "
+            "The row number(s) this occurs in are: [2]. Please correct.\n"
             "Patient: you have inconsistent redaction and text values in "
             "YEAR_CONTACT, INT_CONTACT.\n"
             "Patient: you have inconsistent redaction and text values in "
@@ -1129,7 +1148,7 @@ def test__check_year_death_validity(df, expected_indices):
             "Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. "
             "YEAR_DEATH must be >= YEAR_CONTACT. "
             "There are 2 row(s) with YEAR_DEATH < YEAR_CONTACT. "
-            "Row [2, 3] contain invalid values in the YEAR_DEATH field. Please correct.\n",
+            "The row number(s) this occurs in are: [2, 3]. Please correct.\n",
         ),
     ],
     ids=[
@@ -1145,6 +1164,85 @@ def test__check_year_death_validity_message(invalid_year_death_indices, expected
     assert warning == ""
 
 
+@pytest.mark.parametrize(
+    "df,expected_indices",
+    [
+        (
+            pd.DataFrame({"INT_DOD": [420, 555, 390], "INT_CONTACT": [50, 40, 22]}),
+            [],
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "INT_DOD": [420, float("nan"), 390],
+                    "INT_CONTACT": [50, 40, float("nan")],
+                }
+            ),
+            [],
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "INT_DOD": [float("nan"), float("nan"), 390],
+                    "INT_CONTACT": [50, 40, float("nan")],
+                }
+            ),
+            [],
+        ),
+        (
+            pd.DataFrame({"INT_DOD": [420, 666, 390], "INT_CONTACT": [50, 40, 555]}),
+            [2],
+        ),
+        (
+            pd.DataFrame(
+                {"INT_DOD": [420, float("nan"), 390], "INT_CONTACT": [50, 40, 555]}
+            ),
+            [2],
+        ),
+    ],
+    ids=[
+        "valid_dataframe_no_NAs",
+        "valid_dataframe_w_NAs",
+        "valid_dataframe_all_NAs",
+        "invalid_dataframe_no_NAs",
+        "invalid_dataframe_w_NAs",
+    ],
+)
+def test__check_int_dod_validity(df, expected_indices):
+    invalid_int_dod_indices = genie_registry.clinical._check_int_dod_validity(
+        clinicaldf=df
+    )
+    assert expected_indices == invalid_int_dod_indices.tolist()
+
+
+@pytest.mark.parametrize(
+    "invalid_int_dod_indices,expected_error",
+    [
+        (
+            pd.Index([]),
+            "",
+        ),
+        (
+            pd.Index([2, 3]),
+            "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
+            "INT_DOD must be >= INT_CONTACT. "
+            "There are 2 row(s) with INT_DOD < INT_CONTACT. "
+            "The row number(s) this occurs in are: [2, 3]. Please correct.\n",
+        ),
+    ],
+    ids=[
+        "valid_dataframe",
+        "invalid_dataframe",
+    ],
+)
+def test__check_int_dod_validity_message(invalid_int_dod_indices, expected_error):
+    error, warning = genie_registry.clinical._check_int_dod_validity_message(
+        invalid_int_dod_indices
+    )
+    assert error == expected_error
+    assert warning == ""
+
+
 def get_cross_validate_bed_files_test_cases():
     return [
         {