Sage-Bionetworks · danlu1 · Apr 28, 2024 · Apr 25, 2024 · Apr 28, 2024 · BryanFauble
@@ -192,7 +192,7 @@ def _check_year_death_validity(clinicaldf: pd.DataFrame) -> pd.Index:
     temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce")
     temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce")
     # Compare rows with numeric values in both columns and returns comparion results("True"/"False")
-    # If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Unknown", "Not Applicable"),
+    # If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Not Applicable"),
     # "N/A" will be outputed.
     temp["check_result"] = np.where(
         (pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])),
@@ -228,6 +228,59 @@ def _check_year_death_validity_message(
     return error, warning
 
 
+def _check_int_dod_validity(clinicaldf: pd.DataFrame) -> pd.Index:
+    """
+    INT_DOD should alway be greater than or equal to INT_CONTACT when they are both available.
+    This function checks if INT_DOD >= INT_CONTACT and returns row indices of invalid INT_DOD rows.
+
+    Args:
+        clinicaldf: Clinical Data Frame
+
+    Returns:
+        pd.Index: The row indices of the row with INT_DOD < INT_CONTACT in the input clinical data
+    """
+    # Generate temp dataframe to handle datatype mismatch in a column
+    temp = clinicaldf[["INT_DOD", "INT_CONTACT"]].copy()
+    # Convert INT_DOD and INT_CONTACT to numeric, coercing errors to NaN
+    temp["INT_DOD"] = pd.to_numeric(temp["INT_DOD"], errors="coerce")
+    temp["INT_CONTACT"] = pd.to_numeric(temp["INT_CONTACT"], errors="coerce")
+    # Compare rows with numeric values in both columns and returns comparion results("True"/"False")
+    # If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Not Applicable"),
+    # "N/A" will be outputed.
+    temp["check_result"] = np.where(
+        (pd.isna(temp["INT_DOD"]) | pd.isna(temp["INT_CONTACT"])),
+        "N/A",
+        temp["INT_DOD"] >= temp["INT_CONTACT"],
+    )
+    invalid_int_dod = temp[temp["check_result"] == "False"]
+    return invalid_int_dod.index
+
+
+def _check_int_dod_validity_message(
+    invalid_int_dod_indices: pd.Index,
+) -> Tuple[str, str]:
+    """This function returns the error and warning messages
+    if the input clinical data has row with INT_DOD < INT_CONTACT
+
+    Args:
+        invalid_int_dod_indices: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data
+
+    Returns:
+        Tuple[str, str]: The error message that tells you how many patients with invalid INT_DOD values that your
+        input clinical data has
+    """
+    error = ""
+    warning = ""
+    if len(invalid_int_dod_indices) > 0:
+        error = (
+            "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
+            "INT_DOD must be >= INT_CONTACT. "
+            f"There are {len(invalid_int_dod_indices)} row(s) with INT_DOD < INT_CONTACT. "
+            f"Row {invalid_int_dod_indices.tolist()} contain invalid values in the INT_DOD field. Please correct.\n"
+        )
+    return error, warning
+
+
 # PROCESSING
 def remap_clinical_values(
     clinicaldf: pd.DataFrame,
@@ -956,6 +1009,16 @@ def _validate(self, clinicaldf):
                 )
         else:
             total_error.write("Patient Clinical File: Must have DEAD column.\n")
+
+        # CHECK: INT DOD against INT CONTACT
+        has_int_dod_and_contact = process_functions.checkColExist(
+            clinicaldf, ["INT_DOD", "INT_CONTACT"]
+        )
+        if has_int_dod_and_contact:
+            invalid_int_dod_indices = _check_int_dod_validity(clinicaldf)
+            errors, warnings = _check_int_dod_validity_message(invalid_int_dod_indices)
     def _validate_oncotree_code_mapping( 
         self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame 
     ) -> pd.Index: 
         """Checks that the oncotree codes in the input clinical 
         data is a valid oncotree code from the official oncotree site 
         Args: 
             clinicaldf (pd.DataFrame): clinical input data to validate 
             oncotree_mapping (pd.DataFrame): table of official oncotree 
                 mappings 
         Returns: 
             pd.Index: row indices of unmapped oncotree codes in the 
             input clinical data 
         """ 
         # Make oncotree codes uppercase (SpCC/SPCC) 
         clinicaldf["ONCOTREE_CODE"] = ( 
             clinicaldf["ONCOTREE_CODE"].astype(str).str.upper() 
         ) 
         unmapped_oncotrees = clinicaldf[ 
             (clinicaldf["ONCOTREE_CODE"] != "UNKNOWN") 
             & ~(clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])) 
         ] 
         return unmapped_oncotrees.index 
     def _validate_oncotree_code_mapping_message( 
         self: "Clinical", 
         clinicaldf: pd.DataFrame, 
         unmapped_oncotree_indices: pd.DataFrame, 
     ) -> Tuple[str, str]: 
         """This function returns the error and warning messages 
         if the input clinical data has row indices with unmapped 
         oncotree codes 
         Args: 
             clinicaldf (pd.DataFrame): input clinical data 
             unmapped_oncotree_indices (pd.DataFrame): row indices of the 
                 input clinical data with unmapped oncotree codes 
         Returns: 
             Tuple[str, str]: error message that tells you how many 
                 samples AND the unique unmapped oncotree codes that your 
                 input clinical data has 
         """ 
         errors = "" 
         warnings = "" 
         if len(unmapped_oncotree_indices) > 0: 
             # sort the unique unmapped oncotree codes 
             unmapped_oncotree_codes = sorted( 
                 set(clinicaldf.loc[unmapped_oncotree_indices]["ONCOTREE_CODE"]) 
             ) 
             errors = ( 
                 "Sample Clinical File: Please double check that all your " 
                 "ONCOTREE CODES exist in the mapping. You have {} samples " 
                 "that don't map. These are the codes that " 
                 "don't map: {}\n".format( 
                     len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes) 
                 ) 
             ) 
         return errors, warnings 
     def _validate_oncotree_code_mapping( 
         self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame 
     ) -> pd.Index: 
         """Checks that the oncotree codes in the input clinical 
         data is a valid oncotree code from the official oncotree site 
  
         Args: 
             clinicaldf (pd.DataFrame): clinical input data to validate 
             oncotree_mapping (pd.DataFrame): table of official oncotree 
                 mappings 
  
         Returns: 
             pd.Index: row indices of unmapped oncotree codes in the 
             input clinical data 
         """ 
         # Make oncotree codes uppercase (SpCC/SPCC) 
         clinicaldf["ONCOTREE_CODE"] = ( 
             clinicaldf["ONCOTREE_CODE"].astype(str).str.upper() 
         ) 
  
         unmapped_oncotrees = clinicaldf[ 
             (clinicaldf["ONCOTREE_CODE"] != "UNKNOWN") 
             & ~(clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])) 
         ] 
         return unmapped_oncotrees.index 
  
     def _validate_oncotree_code_mapping_message( 
         self: "Clinical", 
         clinicaldf: pd.DataFrame, 
         unmapped_oncotree_indices: pd.DataFrame, 
     ) -> Tuple[str, str]: 
         """This function returns the error and warning messages 
         if the input clinical data has row indices with unmapped 
         oncotree codes 
  
         Args: 
             clinicaldf (pd.DataFrame): input clinical data 
             unmapped_oncotree_indices (pd.DataFrame): row indices of the 
                 input clinical data with unmapped oncotree codes 
  
         Returns: 
             Tuple[str, str]: error message that tells you how many 
                 samples AND the unique unmapped oncotree codes that your 
                 input clinical data has 
         """ 
         errors = "" 
         warnings = "" 
         if len(unmapped_oncotree_indices) > 0: 
             # sort the unique unmapped oncotree codes 
             unmapped_oncotree_codes = sorted( 
                 set(clinicaldf.loc[unmapped_oncotree_indices]["ONCOTREE_CODE"]) 
             ) 
             errors = ( 
                 "Sample Clinical File: Please double check that all your " 
                 "ONCOTREE CODES exist in the mapping. You have {} samples " 
                 "that don't map. These are the codes that " 
                 "don't map: {}\n".format( 
                     len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes) 
                 ) 
             ) 
         return errors, warnings 
+            total_error.write(errors)
+
         # CHECK: contact vital status value consistency
         contact_error = _check_int_year_consistency(
             clinicaldf=clinicaldf,

@@ -91,19 +91,34 @@ def valid_clinical_df():
                 "GENIE-SAGE-ID3",
                 "GENIE-SAGE-ID4",
                 "GENIE-SAGE-ID5",
+                "GENIE-SAGE-ID6",
             ],
-            SEX=[1, 2, 1, 2, 99],
-            PRIMARY_RACE=[1, 2, 3, 4, 99],
-            SECONDARY_RACE=[1, 2, 3, 4, 99],
-            TERTIARY_RACE=[1, 2, 3, 4, 99],
-            ETHNICITY=[1, 2, 3, 4, 99],
-            BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990],
-            CENTER=["FOO", "FOO", "FOO", "FOO", "FOO"],
-            YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990],
-            INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000],
-            YEAR_DEATH=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<18"],
-            INT_DOD=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<6570"],
-            DEAD=["Unknown", "Not Collected", "Unknown", False, True],
+            SEX=[1, 2, 1, 2, 99, 99],
+            PRIMARY_RACE=[1, 2, 3, 4, 99, 99],
+            SECONDARY_RACE=[1, 2, 3, 4, 99, 99],
+            TERTIARY_RACE=[1, 2, 3, 4, 99, 99],
+            ETHNICITY=[1, 2, 3, 4, 99, 99],
+            BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990, 1990],
+            CENTER=["FOO", "FOO", "FOO", "FOO", "FOO", "FOO"],
+            YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990, 1990],
+            INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000, 2000],
+            YEAR_DEATH=[
+                "Unknown",
+                "Not Collected",
+                "Unknown",
+                "Not Applicable",
+                "<18",
+                "<18",
+            ],
+            INT_DOD=[
+                "Unknown",
+                "Not Collected",
+                "Unknown",
+                "Not Applicable",
+                "<6570",
+                2001,
+            ],
+            DEAD=["Unknown", "Not Collected", "Unknown", False, True, True],
         )
     )
 
@@ -632,7 +647,7 @@ def test_errors__validate(clin_class):
             YEAR_DEATH=["Unknown", "Not Collected", "Not Applicable", 19930, 1990],
             YEAR_CONTACT=["Unknown", "Not Collected", 1990, 1990, 19940],
             INT_CONTACT=[">32485", "<6570", 1990, "Not Collected", ">foobar"],
-            INT_DOD=[">32485", "<6570", "Unknown", "Not Collected", "<dense"],
+            INT_DOD=[">32485", "<6570", 1911, "Not Collected", "<dense"],
             DEAD=[1, False, "Unknown", "Not Collected", "Not Applicable"],
         )
     )
@@ -711,6 +726,10 @@ def test_errors__validate(clin_class):
             "Patient Clinical File: Please double check your DEAD column, "
             "it must be True, False, 'Unknown', "
             "'Not Released' or 'Not Collected'.\n"
+            "Patient Clinical File: Please double check your INT_DOD "
+            "and INT_CONTACT columns. INT_DOD must be >= INT_CONTACT. "
+            "There are 1 row(s) with INT_DOD < INT_CONTACT. "
+            "Row [2] contain invalid values in the INT_DOD field. Please correct.\n"
             "Patient: you have inconsistent redaction and text values in "
             "YEAR_CONTACT, INT_CONTACT.\n"
             "Patient: you have inconsistent redaction and text values in "
@@ -1145,6 +1164,85 @@ def test__check_year_death_validity_message(invalid_year_death_indices, expected
     assert warning == ""
 
 
+@pytest.mark.parametrize(
+    "df,expected_indices",
+    [
+        (
+            pd.DataFrame({"INT_DOD": [420, 555, 390], "INT_CONTACT": [50, 40, 22]}),
+            [],
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "INT_DOD": [420, float("nan"), 390],
+                    "INT_CONTACT": [50, 40, float("nan")],
+                }
+            ),
+            [],
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "INT_DOD": [float("nan"), float("nan"), 390],
+                    "INT_CONTACT": [50, 40, float("nan")],
+                }
+            ),
+            [],
+        ),
+        (
+            pd.DataFrame({"INT_DOD": [420, 666, 390], "INT_CONTACT": [50, 40, 555]}),
+            [2],
+        ),
+        (
+            pd.DataFrame(
+                {"INT_DOD": [420, float("nan"), 390], "INT_CONTACT": [50, 40, 555]}
+            ),
+            [2],
+        ),
+    ],
+    ids=[
+        "valid_dataframe_no_NAs",
+        "valid_dataframe_w_NAs",
+        "valid_dataframe_all_NAs",
+        "invalid_dataframe_no_NAs",
+        "invalid_dataframe_w_NAs",
+    ],
+)
+def test__check_int_dod_validity(df, expected_indices):
+    invalid_int_dod_indices = genie_registry.clinical._check_int_dod_validity(
+        clinicaldf=df
+    )
+    assert expected_indices == invalid_int_dod_indices.tolist()
+
+
+@pytest.mark.parametrize(
+    "invalid_int_dod_indices,expected_error",
+    [
+        (
+            pd.Index([]),
+            "",
+        ),
+        (
+            pd.Index([2, 3]),
+            "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
+            "INT_DOD must be >= INT_CONTACT. "
+            "There are 2 row(s) with INT_DOD < INT_CONTACT. "
+            "Row [2, 3] contain invalid values in the INT_DOD field. Please correct.\n",
+        ),
+    ],
+    ids=[
+        "valid_dataframe",
+        "invalid_dataframe",
+    ],
+)
+def test__check_int_dod_validity_message(invalid_int_dod_indices, expected_error):
+    error, warning = genie_registry.clinical._check_int_dod_validity_message(
+        invalid_int_dod_indices
+    )
+    assert error == expected_error
+    assert warning == ""
+
+
 def get_cross_validate_bed_files_test_cases():
     return [
         {