Skip to content

Commit

Permalink
Merge branch 'develop' into gen-1263-fix-bed-file-dups
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed May 1, 2024
2 parents c68f687 + 4c0f625 commit 2ff972c
Show file tree
Hide file tree
Showing 4 changed files with 362 additions and 28 deletions.
13 changes: 6 additions & 7 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@
import os
import time
from typing import Optional, Union
import yaml

import pandas as pd
import requests
import synapseclient
import yaml
from genie import extract
from requests.adapters import HTTPAdapter
from synapseclient import Synapse
from urllib3.util import Retry

from genie import extract

pd.options.mode.chained_assignment = None

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -166,16 +165,16 @@ def checkUrl(url):


# TODO Add to validate.py
def checkColExist(DF: pd.DataFrame, key: Union[str, int]):
def checkColExist(DF: pd.DataFrame, key: Union[str, int, list]) -> bool:
"""
This function checks if the column exists in a dataframe
This function checks if the column(s) exist(s) in a dataframe
Args:
DF: pandas dataframe
key: Expected column header name
key: Expected column header name(s)
Returns:
bool: True if column exists
bool: True if column(s) exist(s)
"""
result = False if DF.get(key) is None else True
return result
Expand Down
131 changes: 127 additions & 4 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@

# from __future__ import annotations
import datetime
from io import StringIO
import logging
import os
from typing import Optional
from io import StringIO
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import synapseclient

from genie.example_filetype_format import FileTypeFormat
from genie import extract, load, process_functions, validate
from genie.database_to_staging import redact_phi
from genie.example_filetype_format import FileTypeFormat

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -175,6 +175,108 @@ def _check_int_year_consistency(
return ""


def _check_year_death_validity(clinicaldf: pd.DataFrame) -> pd.Index:
"""
YEAR_DEATH should alway be greater than or equal to YEAR_CONTACT when they are both available.
This function checks if YEAR_DEATH >= YEAR_CONTACT and returns row indices of invalid YEAR_DEATH rows.
Args:
clinicaldf: Clinical Data Frame
Returns:
pd.Index: The row indices of the row with YEAR_DEATH < YEAR_CONTACT in the input clinical data
"""
# Generate temp dataframe to handle datatype mismatch in a column
temp = clinicaldf[["YEAR_DEATH", "YEAR_CONTACT"]].copy()
# Convert YEAR_DEATH and YEAR_CONTACT to numeric, coercing errors to NaN
temp["YEAR_DEATH"] = pd.to_numeric(temp["YEAR_DEATH"], errors="coerce")
temp["YEAR_CONTACT"] = pd.to_numeric(temp["YEAR_CONTACT"], errors="coerce")
# Compare rows with numeric values in both YEAR_DEATH and YEAR_CONTACT columns
temp["check_result"] = np.where(
(pd.isna(temp["YEAR_DEATH"]) | pd.isna(temp["YEAR_CONTACT"])),
"N/A",
temp["YEAR_DEATH"] >= temp["YEAR_CONTACT"],
)
invalid_year_death = temp[temp["check_result"] == "False"]
return invalid_year_death.index


def _check_year_death_validity_message(
invalid_year_death_indices: pd.Index,
) -> Tuple[str, str]:
"""This function returns the error and warning messages
if the input clinical data has row with YEAR_DEATH < YEAR_CONTACT
Args:
invalid_year_death_indices: The row indices of the rows with YEAR_DEATH < YEAR_CONTACT in the input clinical data
Returns:
Tuple[str, str]: The error message that tells you how many patients with invalid YEAR_DEATH values that your
input clinical data has
"""
error = ""
warning = ""
if len(invalid_year_death_indices) > 0:
error = (
"Patient Clinical File: Please double check your YEAR_DEATH and YEAR_CONTACT columns. "
"YEAR_DEATH must be >= YEAR_CONTACT. "
f"There are {len(invalid_year_death_indices)} row(s) with YEAR_DEATH < YEAR_CONTACT. "
f"The row number(s) this occurs in are: {invalid_year_death_indices.tolist()}. Please correct.\n"
)
return error, warning


def _check_int_dod_validity(clinicaldf: pd.DataFrame) -> pd.Index:
"""
INT_DOD should alway be greater than or equal to INT_CONTACT when they are both available.
This function checks if INT_DOD >= INT_CONTACT and returns row indices of invalid INT_DOD rows.
Args:
clinicaldf: Clinical Data Frame
Returns:
pd.Index: The row indices of the row with INT_DOD < INT_CONTACT in the input clinical data
"""
# Generate temp dataframe to handle datatype mismatch in a column
temp = clinicaldf[["INT_DOD", "INT_CONTACT"]].copy()
# Convert INT_DOD and INT_CONTACT to numeric, coercing errors to NaN
temp["INT_DOD"] = pd.to_numeric(temp["INT_DOD"], errors="coerce")
temp["INT_CONTACT"] = pd.to_numeric(temp["INT_CONTACT"], errors="coerce")
# Compare rows with numeric values in both INT_DOD and INT_CONTACT columns
temp["check_result"] = np.where(
(pd.isna(temp["INT_DOD"]) | pd.isna(temp["INT_CONTACT"])),
"N/A",
temp["INT_DOD"] >= temp["INT_CONTACT"],
)
invalid_int_dod = temp[temp["check_result"] == "False"]
return invalid_int_dod.index


def _check_int_dod_validity_message(
invalid_int_dod_indices: pd.Index,
) -> Tuple[str, str]:
"""This function returns the error and warning messages
if the input clinical data has row with INT_DOD < INT_CONTACT
Args:
invalid_int_dod_indices: The row indices of the rows with INT_DOD < INT_CONTACT in the input clinical data
Returns:
Tuple[str, str]: The error message that tells you how many patients with invalid INT_DOD values that your
input clinical data has
"""
error = ""
warning = ""
if len(invalid_int_dod_indices) > 0:
error = (
"Patient Clinical File: Please double check your INT_DOD and INT_CONTACT columns. "
"INT_DOD must be >= INT_CONTACT. "
f"There are {len(invalid_int_dod_indices)} row(s) with INT_DOD < INT_CONTACT. "
f"The row number(s) this occurs in are: {invalid_int_dod_indices.tolist()}. Please correct.\n"
)
return error, warning


# PROCESSING
def remap_clinical_values(
clinicaldf: pd.DataFrame,
Expand Down Expand Up @@ -823,6 +925,17 @@ def _validate(self, clinicaldf):
)
total_error.write(error)

# CHECK: YEAR DEATH against YEAR CONTACT
has_death_and_contact_years = process_functions.checkColExist(
clinicaldf, ["YEAR_DEATH", "YEAR_CONTACT"]
)
if has_death_and_contact_years:
invalid_year_death_indices = _check_year_death_validity(clinicaldf)
errors, warnings = _check_year_death_validity_message(
invalid_year_death_indices
)
total_error.write(errors)

# CHECK: INT CONTACT
haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT")
if haveColumn:
Expand Down Expand Up @@ -892,6 +1005,16 @@ def _validate(self, clinicaldf):
)
else:
total_error.write("Patient Clinical File: Must have DEAD column.\n")

# CHECK: INT DOD against INT CONTACT
has_int_dod_and_contact = process_functions.checkColExist(
clinicaldf, ["INT_DOD", "INT_CONTACT"]
)
if has_int_dod_and_contact:
invalid_int_dod_indices = _check_int_dod_validity(clinicaldf)
errors, warnings = _check_int_dod_validity_message(invalid_int_dod_indices)
total_error.write(errors)

# CHECK: contact vital status value consistency
contact_error = _check_int_year_consistency(
clinicaldf=clinicaldf,
Expand Down
Loading

0 comments on commit 2ff972c

Please sign in to comment.