Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-867] Add validation rule to check if INT_DOD >= INT_CONTACT #561

Merged
merged 2 commits into from
Apr 28, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def _check_year_death_validity(clinicaldf: pd.DataFrame) -> pd.Index:
temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce")
temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce")
# Compare rows with numeric values in both columns and returns comparion results("True"/"False")
# If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Unknown", "Not Applicable"),
# If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Not Applicable"),
# "N/A" will be outputed.
temp["check_result"] = np.where(
(pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])),
Expand Down Expand Up @@ -228,6 +228,59 @@ def _check_year_death_validity_message(
return error, warning


def _check_int_dod_validity(clinicaldf: pd.DataFrame) -> pd.Index:
"""
INT_DOD should alway be greater than or equal to INT_CONTACT when they are both available.
This function checks if INT_DOD >= INT_CONTACT and returns row indices of invalid INT_DOD rows.

Args:
clinicaldf: Clinical Data Frame

Returns:
pd.Index: The row indices of the row with INT_DOD < INT_CONTACT in the input clinical data
"""
# Generate temp dataframe to handle datatype mismatch in a column

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try to limit the amount of in-line comments you add. In a lot of cases it has the unintended affect of reducing readability. The code you wrote does well at telling us what the code is doing. That should be the source of truth.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. Will do.

temp = clinicaldf[["INT_DOD", "INT_CONTACT"]].copy()
# Convert INT_DOD and INT_CONTACT to numeric, coercing errors to NaN
temp["INT_DOD"] = pd.to_numeric(temp["INT_DOD"], errors="coerce")
temp["INT_CONTACT"] = pd.to_numeric(temp["INT_CONTACT"], errors="coerce")
# Compare rows with numeric values in both columns and returns comparion results("True"/"False")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This inline comment is misleading. You are only checking for False.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm let me rephrase this.

# If either of the column contains NA or nominal data (e.g. "Unknown", "Not Collected", "Not Applicable"),
# "N/A" will be outputed.
temp["check_result"] = np.where(
(pd.isna(temp["INT_DOD"]) | pd.isna(temp["INT_CONTACT"])),
"N/A",
temp["INT_DOD"] >= temp["INT_CONTACT"],
)
invalid_int_dod = temp[temp["check_result"] == "False"]
return invalid_int_dod.index


def _check_int_dod_validity_message(
invalid_int_dod_indices: pd.Index,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the type here a list of pd.Index? The length check below suggests it's an iterator of values.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might also just be the pandas specific way of doing things that is not super intuitive

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The type is pandas.core.indexes.range.RangeIndex. We can either use len or size.
Screenshot 2024-04-26 at 11 31 37 AM

) -> Tuple[str, str]:
"""This function returns the error and warning messages
if the input clinical data has row with INT_DOD < INT_CONTACT

Args:
invalid_int_dod_indices: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data

Returns:
Tuple[str, str]: The error message that tells you how many patients with invalid INT_DOD values that your
input clinical data has
"""
error = ""
warning = ""
if len(invalid_int_dod_indices) > 0:
error = (
"Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
"INT_DOD must be >= INT_CONTACT. "
f"There are {len(invalid_int_dod_indices)} row(s) with INT_DOD < INT_CONTACT. "
f"Row {invalid_int_dod_indices.tolist()} contain invalid values in the INT_DOD field. Please correct.\n"
Copy link
Contributor

@rxu17 rxu17 Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part of the message is bit misleading because it implies that the INT_DOD field is the issue when it could be INT_CONTACT. You could rephrase this part of the error message to be like:
The row number(s) this occurs in are: {invalid_year_death_indices.tolist()}. Please correct.

I think this is happening for the YEAR_DEATH AND YEAR_CONTACT validation rule function too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ Please fix the above in this PR as well. Thanks!

)
return error, warning


# PROCESSING
def remap_clinical_values(
clinicaldf: pd.DataFrame,
Expand Down Expand Up @@ -956,6 +1009,16 @@ def _validate(self, clinicaldf):
)
else:
total_error.write("Patient Clinical File: Must have DEAD column.\n")

# CHECK: INT DOD against INT CONTACT
has_int_dod_and_contact = process_functions.checkColExist(
clinicaldf, ["INT_DOD", "INT_CONTACT"]
)
if has_int_dod_and_contact:
invalid_int_dod_indices = _check_int_dod_validity(clinicaldf)
errors, warnings = _check_int_dod_validity_message(invalid_int_dod_indices)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit:
errors, _
As warnings is not used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is adapted from

def _validate_oncotree_code_mapping(
self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame
) -> pd.Index:
"""Checks that the oncotree codes in the input clinical
data is a valid oncotree code from the official oncotree site
Args:
clinicaldf (pd.DataFrame): clinical input data to validate
oncotree_mapping (pd.DataFrame): table of official oncotree
mappings
Returns:
pd.Index: row indices of unmapped oncotree codes in the
input clinical data
"""
# Make oncotree codes uppercase (SpCC/SPCC)
clinicaldf["ONCOTREE_CODE"] = (
clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
)
unmapped_oncotrees = clinicaldf[
(clinicaldf["ONCOTREE_CODE"] != "UNKNOWN")
& ~(clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"]))
]
return unmapped_oncotrees.index
def _validate_oncotree_code_mapping_message(
self: "Clinical",
clinicaldf: pd.DataFrame,
unmapped_oncotree_indices: pd.DataFrame,
) -> Tuple[str, str]:
"""This function returns the error and warning messages
if the input clinical data has row indices with unmapped
oncotree codes
Args:
clinicaldf (pd.DataFrame): input clinical data
unmapped_oncotree_indices (pd.DataFrame): row indices of the
input clinical data with unmapped oncotree codes
Returns:
Tuple[str, str]: error message that tells you how many
samples AND the unique unmapped oncotree codes that your
input clinical data has
"""
errors = ""
warnings = ""
if len(unmapped_oncotree_indices) > 0:
# sort the unique unmapped oncotree codes
unmapped_oncotree_codes = sorted(
set(clinicaldf.loc[unmapped_oncotree_indices]["ONCOTREE_CODE"])
)
errors = (
"Sample Clinical File: Please double check that all your "
"ONCOTREE CODES exist in the mapping. You have {} samples "
"that don't map. These are the codes that "
"don't map: {}\n".format(
len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes)
)
)
return errors, warnings
and we might need to output warnings in the future.

total_error.write(errors)

# CHECK: contact vital status value consistency
contact_error = _check_int_year_consistency(
clinicaldf=clinicaldf,
Expand Down
124 changes: 111 additions & 13 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,19 +91,34 @@ def valid_clinical_df():
"GENIE-SAGE-ID3",
"GENIE-SAGE-ID4",
"GENIE-SAGE-ID5",
"GENIE-SAGE-ID6",
],
SEX=[1, 2, 1, 2, 99],
PRIMARY_RACE=[1, 2, 3, 4, 99],
SECONDARY_RACE=[1, 2, 3, 4, 99],
TERTIARY_RACE=[1, 2, 3, 4, 99],
ETHNICITY=[1, 2, 3, 4, 99],
BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990],
CENTER=["FOO", "FOO", "FOO", "FOO", "FOO"],
YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990],
INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000],
YEAR_DEATH=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<18"],
INT_DOD=["Unknown", "Not Collected", "Unknown", "Not Applicable", "<6570"],
DEAD=["Unknown", "Not Collected", "Unknown", False, True],
SEX=[1, 2, 1, 2, 99, 99],
PRIMARY_RACE=[1, 2, 3, 4, 99, 99],
SECONDARY_RACE=[1, 2, 3, 4, 99, 99],
TERTIARY_RACE=[1, 2, 3, 4, 99, 99],
ETHNICITY=[1, 2, 3, 4, 99, 99],
BIRTH_YEAR=[1222, "Unknown", 1920, 1990, 1990, 1990],
CENTER=["FOO", "FOO", "FOO", "FOO", "FOO", "FOO"],
YEAR_CONTACT=["Unknown", "Not Collected", ">89", "<18", 1990, 1990],
INT_CONTACT=["Unknown", "Not Collected", ">32485", "<6570", 2000, 2000],
YEAR_DEATH=[
"Unknown",
"Not Collected",
"Unknown",
"Not Applicable",
"<18",
"<18",
],
INT_DOD=[
"Unknown",
"Not Collected",
"Unknown",
"Not Applicable",
"<6570",
2001,
],
DEAD=["Unknown", "Not Collected", "Unknown", False, True, True],
)
)

Expand Down Expand Up @@ -632,7 +647,7 @@ def test_errors__validate(clin_class):
YEAR_DEATH=["Unknown", "Not Collected", "Not Applicable", 19930, 1990],
YEAR_CONTACT=["Unknown", "Not Collected", 1990, 1990, 19940],
INT_CONTACT=[">32485", "<6570", 1990, "Not Collected", ">foobar"],
INT_DOD=[">32485", "<6570", "Unknown", "Not Collected", "<dense"],
INT_DOD=[">32485", "<6570", 1911, "Not Collected", "<dense"],
DEAD=[1, False, "Unknown", "Not Collected", "Not Applicable"],
)
)
Expand Down Expand Up @@ -711,6 +726,10 @@ def test_errors__validate(clin_class):
"Patient Clinical File: Please double check your DEAD column, "
"it must be True, False, 'Unknown', "
"'Not Released' or 'Not Collected'.\n"
"Patient Clinical File: Please double check your INT_DOD "
"and INT_CONTACT columns. INT_DOD must be >= INT_CONTACT. "
"There are 1 row(s) with INT_DOD < INT_CONTACT. "
"Row [2] contain invalid values in the INT_DOD field. Please correct.\n"
"Patient: you have inconsistent redaction and text values in "
"YEAR_CONTACT, INT_CONTACT.\n"
"Patient: you have inconsistent redaction and text values in "
Expand Down Expand Up @@ -1145,6 +1164,85 @@ def test__check_year_death_validity_message(invalid_year_death_indices, expected
assert warning == ""


@pytest.mark.parametrize(
"df,expected_indices",
[
(
pd.DataFrame({"INT_DOD": [420, 555, 390], "INT_CONTACT": [50, 40, 22]}),
[],
),
(
pd.DataFrame(
{
"INT_DOD": [420, float("nan"), 390],
"INT_CONTACT": [50, 40, float("nan")],
}
),
[],
),
(
pd.DataFrame(
{
"INT_DOD": [float("nan"), float("nan"), 390],
"INT_CONTACT": [50, 40, float("nan")],
}
),
[],
),
(
pd.DataFrame({"INT_DOD": [420, 666, 390], "INT_CONTACT": [50, 40, 555]}),
[2],
),
(
pd.DataFrame(
{"INT_DOD": [420, float("nan"), 390], "INT_CONTACT": [50, 40, 555]}
),
[2],
),
],
ids=[
"valid_dataframe_no_NAs",
"valid_dataframe_w_NAs",
"valid_dataframe_all_NAs",
"invalid_dataframe_no_NAs",
"invalid_dataframe_w_NAs",
],
)
def test__check_int_dod_validity(df, expected_indices):
invalid_int_dod_indices = genie_registry.clinical._check_int_dod_validity(
clinicaldf=df
)
assert expected_indices == invalid_int_dod_indices.tolist()


@pytest.mark.parametrize(
"invalid_int_dod_indices,expected_error",
[
(
pd.Index([]),
"",
),
(
pd.Index([2, 3]),
"Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
"INT_DOD must be >= INT_CONTACT. "
"There are 2 row(s) with INT_DOD < INT_CONTACT. "
"Row [2, 3] contain invalid values in the INT_DOD field. Please correct.\n",
),
],
ids=[
"valid_dataframe",
"invalid_dataframe",
],
)
def test__check_int_dod_validity_message(invalid_int_dod_indices, expected_error):
error, warning = genie_registry.clinical._check_int_dod_validity_message(
invalid_int_dod_indices
)
assert error == expected_error
assert warning == ""


def get_cross_validate_bed_files_test_cases():
return [
{
Expand Down
Loading