Skip to content

Commit

Permalink
[GEN-1000] Refactor and sort unmapped oncotree codes in validation me…
Browse files Browse the repository at this point in the history
…ssage (#558)

* update method to concatenate columns

* revert to original join function but add checks for empty df

* Update test_load.py

remove unused package

* refactor _update_table

* add test cases for separated functions

* Update test_load.py

remove unused modules

* separate out oncotree code validation, sort oncotree codes

* add tests with duplicated unmapped codes

* make oncotree list more readable

---------

Co-authored-by: danlu1 <[email protected]>
Co-authored-by: Dan Lu <[email protected]>
  • Loading branch information
3 people authored Apr 19, 2024
1 parent e2c2321 commit 0e81107
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 22 deletions.
91 changes: 69 additions & 22 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from io import StringIO
import logging
import os
from typing import Optional
from typing import Optional, Tuple

import pandas as pd
import synapseclient
Expand Down Expand Up @@ -472,6 +472,68 @@ def process_steps(
newClinicalDf.to_csv(newPath, sep="\t", index=False)
return newPath

def _validate_oncotree_code_mapping(
    self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame
) -> pd.Index:
    """Find the rows of the clinical data whose oncotree code is not
    present in the oncotree mapping table.

    Note that the ONCOTREE_CODE column of ``clinicaldf`` is overwritten
    in place with its upper-cased string form, so the caller sees the
    normalized codes after this call.

    Args:
        clinicaldf (pd.DataFrame): clinical input data to validate,
            must contain an ONCOTREE_CODE column
        oncotree_mapping (pd.DataFrame): table of oncotree mappings,
            must contain an ONCOTREE_CODE column

    Returns:
        pd.Index: row indices of the input clinical data whose code is
            neither "UNKNOWN" nor found in the mapping
    """
    # Normalize casing so that e.g. SpCC and SPCC are treated as the same code
    normalized_codes = clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
    clinicaldf["ONCOTREE_CODE"] = normalized_codes

    # A row is acceptable when it is explicitly UNKNOWN or is in the mapping
    is_unknown = normalized_codes == "UNKNOWN"
    is_mapped = normalized_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
    is_acceptable = is_unknown | is_mapped

    return clinicaldf[~is_acceptable].index

def _validate_oncotree_code_mapping_message(
    self: "Clinical",
    clinicaldf: pd.DataFrame,
    unmapped_oncotree_indices: pd.Index,
) -> Tuple[str, str]:
    """Build the error and warning messages for rows of the input
    clinical data that have unmapped oncotree codes.

    Args:
        clinicaldf (pd.DataFrame): input clinical data, must contain an
            ONCOTREE_CODE column
        unmapped_oncotree_indices (pd.Index): row index labels of the
            input clinical data with unmapped oncotree codes. Any
            list-like of labels accepted by ``DataFrame.loc`` works.

    Returns:
        Tuple[str, str]: (errors, warnings). The error message reports
            how many samples don't map AND the sorted, unique unmapped
            oncotree codes; it is an empty string when there are no
            unmapped rows. The warning message is always an empty string.
    """
    errors = ""
    warnings = ""
    # len() rather than truthiness: the truth value of a pd.Index is ambiguous
    if len(unmapped_oncotree_indices) > 0:
        # Select rows and column in a single .loc call, and cast to str so
        # sorting/joining cannot fail on non-string codes
        unmapped_codes = clinicaldf.loc[
            unmapped_oncotree_indices, "ONCOTREE_CODE"
        ].astype(str)
        # sort the unique unmapped oncotree codes for a stable message
        unmapped_oncotree_codes = sorted(set(unmapped_codes))
        errors = (
            "Sample Clinical File: Please double check that all your "
            "ONCOTREE CODES exist in the mapping. You have {} samples "
            "that don't map. These are the codes that "
            "don't map: {}\n".format(
                len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes)
            )
        )
    return errors, warnings

# VALIDATION
def _validate(self, clinicaldf):
"""
Expand Down Expand Up @@ -641,28 +703,13 @@ def _validate(self, clinicaldf):
maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
if haveColumn:
# Make oncotree codes uppercase (SpCC/SPCC)
clinicaldf["ONCOTREE_CODE"] = (
clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
unmapped_indices = self._validate_oncotree_code_mapping(
clinicaldf, oncotree_mapping
)

oncotree_codes = clinicaldf["ONCOTREE_CODE"][
clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
]

if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
unmapped_oncotrees = oncotree_codes[
~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
]
total_error.write(
"Sample Clinical File: Please double check that all your "
"ONCOTREE CODES exist in the mapping. You have {} samples "
"that don't map. These are the codes that "
"don't map: {}\n".format(
len(unmapped_oncotrees),
",".join(set(unmapped_oncotrees)),
)
)
errors, warnings = self._validate_oncotree_code_mapping_message(
clinicaldf, unmapped_indices
)
total_error.write(errors)
# Should add the SEX mismatch into the dashboard file
if (
process_functions.checkColExist(clinicaldf, "SEX")
Expand Down
69 changes: 69 additions & 0 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,75 @@ def test_sample__process(clin_class):
assert expected_sampledf.equals(new_sampledf[expected_sampledf.columns])


@pytest.mark.parametrize(
    ("input_df", "expected_unmapped_indices"),
    [
        pytest.param(
            pd.DataFrame(
                {"ONCOTREE_CODE": ["AMPCA", "AMPCA", "Unknown", "AMPCA", "AMPCA"]}
            ),
            [],
            id="no_unmapped",
        ),
        pytest.param(
            pd.DataFrame({"ONCOTREE_CODE": ["XXXX", "XX", "TEST", "AMPCA"]}),
            [0, 1, 2],
            id="unmapped_unique",
        ),
        pytest.param(
            pd.DataFrame({"ONCOTREE_CODE": ["XXXX", "XX", "TEST", "AMPCA", "XXXX"]}),
            [0, 1, 2, 4],
            id="unmapped_dups",
        ),
    ],
)
def test__validate_oncotree_code_mapping_returns_expected_unmapped_indices(
    clin_class, input_df, expected_unmapped_indices
) -> None:
    """The validator returns the row indices of every code that is
    neither "Unknown" (any casing) nor present in the mapping table,
    including repeated occurrences of the same unmapped code.
    """
    # Minimal mapping table: only AMPCA and ACA count as valid codes
    official_codes = pd.DataFrame({"ONCOTREE_CODE": ["AMPCA", "ACA"]})
    actual_indices = clin_class._validate_oncotree_code_mapping(
        clinicaldf=input_df, oncotree_mapping=official_codes
    )
    assert expected_unmapped_indices == actual_indices.tolist()


@pytest.mark.parametrize(
    ("input_df", "unmapped_indices", "expected_error"),
    [
        pytest.param(
            pd.DataFrame(
                {"ONCOTREE_CODE": ["AMPCA", "AMPCA", "Unknown", "AMPCA", "AMPCA"]}
            ),
            [],
            "",
            id="no_unmapped",
        ),
        pytest.param(
            pd.DataFrame({"ONCOTREE_CODE": ["XXXX", "ZGT", "TEST", "AMPCA"]}),
            [0, 1, 2],
            "Sample Clinical File: Please double check that all your "
            "ONCOTREE CODES exist in the mapping. You have 3 samples "
            "that don't map. These are the codes that "
            "don't map: TEST,XXXX,ZGT\n",
            id="unmapped_unique",
        ),
        pytest.param(
            pd.DataFrame({"ONCOTREE_CODE": ["XXXX", "ZGT", "TEST", "AMPCA", "XXXX"]}),
            [0, 1, 2, 4],
            "Sample Clinical File: Please double check that all your "
            "ONCOTREE CODES exist in the mapping. You have 4 samples "
            "that don't map. These are the codes that "
            "don't map: TEST,XXXX,ZGT\n",
            id="unmapped_dups",
        ),
    ],
)
def test__validate_oncotree_code_mapping_message_returns_expected_error_messages(
    clin_class, input_df, unmapped_indices, expected_error
):
    """The message builder reports the number of unmapped rows and the
    sorted, de-duplicated unmapped codes, and never emits a warning.
    """
    actual_error, actual_warning = clin_class._validate_oncotree_code_mapping_message(
        clinicaldf=input_df, unmapped_oncotree_indices=unmapped_indices
    )
    assert expected_error == actual_error
    assert actual_warning == ""


def test_perfect__validate(clin_class, valid_clinical_df):
"""
Test perfect validation
Expand Down

0 comments on commit 0e81107

Please sign in to comment.