Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-868] add year death validation #560

Merged
merged 9 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@
import os
import time
from typing import Optional, Union
import yaml

import pandas as pd
import requests
import synapseclient
import yaml
from genie import extract
from requests.adapters import HTTPAdapter
from synapseclient import Synapse
from urllib3.util import Retry

from genie import extract

pd.options.mode.chained_assignment = None

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -166,16 +165,16 @@ def checkUrl(url):


# TODO Add to validate.py
def checkColExist(DF: pd.DataFrame, key: Union[str, int]):
def checkColExist(DF: pd.DataFrame, key: Union[str, int, list]) -> bool:
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
"""
This function checks if the column exists in a dataframe
This function checks if the column(s) exist(s) in a dataframe

Args:
DF: pandas dataframe
key: Expected column header name
key: Expected column header name(s)

Returns:
bool: True if column exists
bool: True if column(s) exist(s)
"""
result = False if DF.get(key) is None else True
return result
Expand Down
41 changes: 38 additions & 3 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@

# from __future__ import annotations
import datetime
from io import StringIO
import logging
import os
from io import StringIO
from typing import Optional

import numpy as np
import pandas as pd
import synapseclient

from genie.example_filetype_format import FileTypeFormat
from genie import extract, load, process_functions, validate
from genie.database_to_staging import redact_phi
from genie.example_filetype_format import FileTypeFormat

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -175,6 +175,33 @@ def _check_int_year_consistency(
return ""


def _check_year_death_validity(clinicaldf: pd.DataFrame) -> str:
"""
Check if YEAR_DEATH >= YEAR_CONTACT
danlu1 marked this conversation as resolved.
Show resolved Hide resolved

Args:
clinicaldf: Clinical Data Frame

Returns:
Error message if YEAR_DEATH if invalid
"""
error = ""
# Generate temp dataframe to handle datatype mismatch in a column
temp = clinicaldf.copy()
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
# Convert YEAR_DEATH and YEAR_CONTACT to numeric, coercing errors to NaN
temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce")
temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce")
check_result = np.where(
(pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])),
"N/A",
temp["YEAR_DEATH"] >= temp["YEAR_CONTACT"],
)
index = [",".join(str(idx)) for idx, i in enumerate(check_result) if i == "False"]
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
if "False" in check_result:
error = "Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. YEAR_DEATH must be >= YEAR_CONTACT.\n"
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
return error


# PROCESSING
def remap_clinical_values(
clinicaldf: pd.DataFrame,
Expand Down Expand Up @@ -823,6 +850,14 @@ def _validate(self, clinicaldf):
)
total_error.write(error)

# CHECK: YEAR DEATH against YEAR CONTACT
haveColumn = process_functions.checkColExist(
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
clinicaldf, ["YEAR_DEATH", "YEAR_CONTACT"]
)
if haveColumn:
error = _check_year_death_validity(clinicaldf=clinicaldf)
total_error.write(error)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A stretch goal outside of the explicit scope of this change would be to split out these other items into their own functions. Is there a tech debt jira that already exists, or could one be created to address this item?

Cc: @rxu17

Copy link
Contributor

@rxu17 rxu17 Apr 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we have a tech debt jira/part of the overarching goal for this. The goal is to eventually have a validation function for each validation rule for each fileformat especially when we want to implement GX framework for this.

I think this is part of our documentation epic as well. To break up the validation code so we can document each validation rule individually and completely.

This is not just happening here but throughout the genie code (also for processing as well) but we definitely want to start with validation first.

# CHECK: INT CONTACT
haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
if haveColumn:
Expand Down
59 changes: 55 additions & 4 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from collections import Counter
import datetime
import json
from collections import Counter
from unittest import mock

import genie_registry
import pandas as pd
import pytest
import synapseclient

from genie import process_functions, validate
import genie_registry
from genie_registry.clinical import Clinical


Expand Down Expand Up @@ -662,7 +661,6 @@ def test_errors__validate(clin_class):
) as mock_get_onco_map:
error, warning = clin_class._validate(clinicalDf)
mock_get_onco_map.called_once_with(json_oncotreeurl)

expectedErrors = (
"Sample Clinical File: SAMPLE_ID must start with GENIE-SAGE\n"
"Patient Clinical File: PATIENT_ID must start with GENIE-SAGE\n"
Expand Down Expand Up @@ -700,6 +698,8 @@ def test_errors__validate(clin_class):
"Patient Clinical File: Please double check your YEAR_CONTACT "
"column, it must be an integer in YYYY format <= {year} or "
"'Unknown', 'Not Collected', 'Not Released', '>89', '<18'.\n"
"Patient Clinical File: Please double check your YEAR_DEATH "
"and YEAR_CONTACT columns. YEAR_DEATH must be >= YEAR_CONTACT.\n"
"Patient Clinical File: Please double check your INT_CONTACT "
"column, it must be an integer, '>32485', '<6570', 'Unknown', "
"'Not Released' or 'Not Collected'.\n"
Expand Down Expand Up @@ -1062,6 +1062,57 @@ def test__check_int_dead_consistency_inconsistent(inconsistent_df):
)


@pytest.mark.parametrize(
"df,expected_error",
[
(
pd.DataFrame({"YEAR_DEATH": [420, 555, 390], "YEAR_CONTACT": [50, 40, 22]}),
"",
),
(
pd.DataFrame(
{
"YEAR_DEATH": [420, float("nan"), 390],
"YEAR_CONTACT": [50, 40, float("nan")],
}
),
"",
),
(
pd.DataFrame(
{
"YEAR_DEATH": [float("nan"), float("nan"), 390],
"YEAR_CONTACT": [50, 40, float("nan")],
}
),
"",
),
(
pd.DataFrame(
{"YEAR_DEATH": [420, 666, 390], "YEAR_CONTACT": [50, 40, 555]}
),
"Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. YEAR_DEATH must be >= YEAR_CONTACT.\n",
),
(
pd.DataFrame(
{"YEAR_DEATH": [420, float("nan"), 390], "YEAR_CONTACT": [50, 40, 555]}
),
"Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. YEAR_DEATH must be >= YEAR_CONTACT.\n",
),
],
ids=[
"valid_dataframe_no_NAs",
"valid_dataframe_w_NAs",
"valid_dataframe_all_NAs",
"invalid_dataframe_no_NAs",
"invalid_dataframe_w_NAs",
],
)
def test__check_year_death_validity(df, expected_error):
error = genie_registry.clinical._check_year_death_validity(clinicaldf=df)
assert error == expected_error


def get_cross_validate_bed_files_test_cases():
return [
{
Expand Down
Loading