diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index 8ad682b3..65c59429 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -191,9 +191,7 @@ def _check_year_death_validity(clinicaldf: pd.DataFrame) -> pd.Index: # Convert YEAR_DEATH and YEAR_CONTACT to numeric, coercing errors to NaN temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce") temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce") - # Compare rows with numeric values in both columns and returns comparion results("True"/"False") - # If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Unknown", "Not Applicable"), - # "N/A" will be outputed. + # Compare rows with numeric values in both YEAR_DEATH and YEAR_CONTACT columns temp["check_result"] = np.where( (pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])), "N/A", @@ -223,7 +221,58 @@ def _check_year_death_validity_message( "Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. " "YEAR_DEATH must be >= YEAR_CONTACT. " f"There are {len(invalid_year_death_indices)} row(s) with YEAR_DEATH < YEAR_CONTACT. " - f"Row {invalid_year_death_indices.tolist()} contain invalid values in the YEAR_DEATH field. Please correct.\n" + f"The row number(s) this occurs in are: {invalid_year_death_indices.tolist()}. Please correct.\n" + ) + return error, warning + + +def _check_int_dod_validity(clinicaldf: pd.DataFrame) -> pd.Index: + """ + INT_DOD should alway be greater than or equal to INT_CONTACT when they are both available. + This function checks if INT_DOD >= INT_CONTACT and returns row indices of invalid INT_DOD rows. 
+ + Args: + clinicaldf: Clinical Data Frame + + Returns: + pd.Index: The row indices of the row with INT_DOD < INT_CONTACT in the input clinical data + """ + # Generate temp dataframe to handle datatype mismatch in a column + temp = clinicaldf[["INT_DOD", "INT_CONTACT"]].copy() + # Convert INT_DOD and INT_CONTACT to numeric, coercing errors to NaN + temp["INT_DOD"] = pd.to_numeric(temp["INT_DOD"], errors="coerce") + temp["INT_CONTACT"] = pd.to_numeric(temp["INT_CONTACT"], errors="coerce") + # Compare rows with numeric values in both INT_DOD and INT_CONTACT columns + temp["check_result"] = np.where( + (pd.isna(temp["INT_DOD"]) | pd.isna(temp["INT_CONTACT"])), + "N/A", + temp["INT_DOD"] >= temp["INT_CONTACT"], + ) + invalid_int_dod = temp[temp["check_result"] == "False"] + return invalid_int_dod.index + + +def _check_int_dod_validity_message( + invalid_int_dod_indices: pd.Index, +) -> Tuple[str, str]: + """This function returns the error and warning messages + if the input clinical data has row with INT_DOD < INT_CONTACT + + Args: + invalid_int_dod_indices: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data + + Returns: + Tuple[str, str]: The error message that tells you how many patients with invalid INT_DOD values that your + input clinical data has + """ + error = "" + warning = "" + if len(invalid_int_dod_indices) > 0: + error = ( + "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. " + "INT_DOD must be >= INT_CONTACT. " + f"There are {len(invalid_int_dod_indices)} row(s) with INT_DOD < INT_CONTACT. " + f"The row number(s) this occurs in are: {invalid_int_dod_indices.tolist()}. 
Please correct.\n" ) return error, warning @@ -956,6 +1005,16 @@ def _validate(self, clinicaldf): ) else: total_error.write("Patient Clinical File: Must have DEAD column.\n") + + # CHECK: INT DOD against INT CONTACT + has_int_dod_and_contact = process_functions.checkColExist( + clinicaldf, ["INT_DOD", "INT_CONTACT"] + ) + if has_int_dod_and_contact: + invalid_int_dod_indices = _check_int_dod_validity(clinicaldf) + errors, warnings = _check_int_dod_validity_message(invalid_int_dod_indices) + total_error.write(errors) + # CHECK: contact vital status value consistency contact_error = _check_int_year_consistency( clinicaldf=clinicaldf, diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 59e6f513..f25ff8fb 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -91,19 +91,34 @@ def valid_clinical_df(): "GENIE-SAGE-ID3", "GENIE-SAGE-ID4", "GENIE-SAGE-ID5", + "GENIE-SAGE-ID6", ], - SEX=[1, 2, 1, 2, 99], - PRIMARY_RACE=[1, 2, 3, 4, 99], - SECONDARY_RACE=[1, 2, 3, 4, 99], - TERTIARY_RACE=[1, 2, 3, 4, 99], - ETHNICITY=[1, 2, 3, 4, 99], - BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990], - CENTER=["FOO", "FOO", "FOO", "FOO", "FOO"], - YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990], - INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000], - YEAR_DEATH=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<18"], - INT_DOD=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<6570"], - DEAD=["Unknown", "Not Collected", "Unknown", False, True], + SEX=[1, 2, 1, 2, 99, 99], + PRIMARY_RACE=[1, 2, 3, 4, 99, 99], + SECONDARY_RACE=[1, 2, 3, 4, 99, 99], + TERTIARY_RACE=[1, 2, 3, 4, 99, 99], + ETHNICITY=[1, 2, 3, 4, 99, 99], + BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990, 1990], + CENTER=["FOO", "FOO", "FOO", "FOO", "FOO", "FOO"], + YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990, 1990], + INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000, 2000], + YEAR_DEATH=[ + "Unknown", + "Not Collected", + 
"Unknown", + "Not Applicable", + "<18", + "<18", + ], + INT_DOD=[ + "Unknown", + "Not Collected", + "Unknown", + "Not Applicable", + "<6570", + 2001, + ], + DEAD=["Unknown", "Not Collected", "Unknown", False, True, True], ) ) @@ -632,7 +647,7 @@ def test_errors__validate(clin_class): YEAR_DEATH=["Unknown", "Not Collected", "Not Applicable", 19930, 1990], YEAR_CONTACT=["Unknown", "Not Collected", 1990, 1990, 19940], INT_CONTACT=[">32485", "<6570", 1990, "Not Collected", ">foobar"], - INT_DOD=[">32485", "<6570", "Unknown", "Not Collected", "32485", "<6570", 1911, "Not Collected", "= YEAR_CONTACT. " "There are 1 row(s) with YEAR_DEATH < YEAR_CONTACT. " - "Row [4] contain invalid values in the YEAR_DEATH field. Please correct.\n" + "The row number(s) this occurs in are: [4]. Please correct.\n" "Patient Clinical File: Please double check your INT_CONTACT " "column, it must be an integer, '>32485', '<6570', 'Unknown', " "'Not Released' or 'Not Collected'.\n" @@ -711,6 +726,10 @@ def test_errors__validate(clin_class): "Patient Clinical File: Please double check your DEAD column, " "it must be True, False, 'Unknown', " "'Not Released' or 'Not Collected'.\n" + "Patient Clinical File: Please double check your INT_DOD " + "and INT_CONTACT columns. INT_DOD must be >= INT_CONTACT. " + "There are 1 row(s) with INT_DOD < INT_CONTACT. " + "The row number(s) this occurs in are: [2]. Please correct.\n" "Patient: you have inconsistent redaction and text values in " "YEAR_CONTACT, INT_CONTACT.\n" "Patient: you have inconsistent redaction and text values in " @@ -1129,7 +1148,7 @@ def test__check_year_death_validity(df, expected_indices): "Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. " "YEAR_DEATH must be >= YEAR_CONTACT. " "There are 2 row(s) with YEAR_DEATH < YEAR_CONTACT. " - "Row [2, 3] contain invalid values in the YEAR_DEATH field. Please correct.\n", + "The row number(s) this occurs in are: [2, 3]. 
@pytest.mark.parametrize(
    "df,expected_indices",
    [
        # Every row has INT_DOD >= INT_CONTACT
        pytest.param(
            pd.DataFrame({"INT_DOD": [420, 555, 390], "INT_CONTACT": [50, 40, 22]}),
            [],
            id="valid_dataframe_no_NAs",
        ),
        # Rows with a missing value on either side are not compared
        pytest.param(
            pd.DataFrame(
                {
                    "INT_DOD": [420, float("nan"), 390],
                    "INT_CONTACT": [50, 40, float("nan")],
                }
            ),
            [],
            id="valid_dataframe_w_NAs",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "INT_DOD": [float("nan"), float("nan"), 390],
                    "INT_CONTACT": [50, 40, float("nan")],
                }
            ),
            [],
            id="valid_dataframe_all_NAs",
        ),
        # Row 2 has INT_DOD (390) < INT_CONTACT (555)
        pytest.param(
            pd.DataFrame({"INT_DOD": [420, 666, 390], "INT_CONTACT": [50, 40, 555]}),
            [2],
            id="invalid_dataframe_no_NAs",
        ),
        pytest.param(
            pd.DataFrame(
                {"INT_DOD": [420, float("nan"), 390], "INT_CONTACT": [50, 40, 555]}
            ),
            [2],
            id="invalid_dataframe_w_NAs",
        ),
    ],
)
def test__check_int_dod_validity(df, expected_indices):
    """The check returns exactly the indices of rows with INT_DOD < INT_CONTACT."""
    observed = genie_registry.clinical._check_int_dod_validity(clinicaldf=df)
    assert observed.tolist() == expected_indices


@pytest.mark.parametrize(
    "invalid_int_dod_indices,expected_error",
    [
        # No invalid rows: no error text at all
        pytest.param(pd.Index([]), "", id="valid_dataframe"),
        # Two invalid rows: the count and the row numbers are both reported
        pytest.param(
            pd.Index([2, 3]),
            "Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
            "INT_DOD must be >= INT_CONTACT. "
            "There are 2 row(s) with INT_DOD < INT_CONTACT. "
            "The row number(s) this occurs in are: [2, 3]. Please correct.\n",
            id="invalid_dataframe",
        ),
    ],
)
def test__check_int_dod_validity_message(invalid_int_dod_indices, expected_error):
    """The message helper reports errors only and never emits a warning."""
    messages = genie_registry.clinical._check_int_dod_validity_message(
        invalid_int_dod_indices
    )
    error, warning = messages
    assert error == expected_error
    assert warning == ""