Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-1313] Export detailed columns for NAACCR codes #567

Merged
merged 10 commits into from
May 28, 2024
15 changes: 11 additions & 4 deletions genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,12 @@ def consortiumToPublic(
)

# Clinical release scope filter
# If consortium -> Don't release to public
# TODO: check why this synapse id is hard coded?
clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope"
][0]
publicRelease = extract.get_syntabledf(
syn=syn, query_string="SELECT * FROM syn8545211 where releaseScope = 'public'"
syn=syn,
query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
)

allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
Expand Down Expand Up @@ -186,7 +188,12 @@ def consortiumToPublic(
)

# Grab mapping table to fill in clinical headers
mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600")
clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map"
][0]
mapping = extract.get_syntabledf(
syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}"
)
genePanelEntities = []
for entName, entId in consortiumRelease[2]:
is_deprecated_file = entName in ["data_fusions.txt"]
Expand Down
17 changes: 14 additions & 3 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,7 @@ def store_clinical_files(
release_synid,
current_release_staging,
center_mappingdf,
databaseSynIdMappingDf,
used=None,
):
"""
Expand All @@ -1030,6 +1031,7 @@ def store_clinical_files(
release_synid: Synapse id to store release file
current_release_staging: Staging flag
center_mappingdf: Center mapping dataframe
databaseSynIdMappingDf: Database to Synapse Id mapping

Returns:
pandas.DataFrame: configured clinical dataframe
Expand Down Expand Up @@ -1154,7 +1156,12 @@ def store_clinical_files(
keep_merged_consortium_samples = clinicaldf.SAMPLE_ID
# This mapping table is the GENIE clinical code to description
# mapping to generate the headers of the clinical file
mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600")
clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map"
][0]
mapping = extract.get_syntabledf(
syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}"
)
clinical_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical.txt")
clinical_sample_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_sample.txt")
clinical_patient_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_patient.txt")
Expand Down Expand Up @@ -1564,6 +1571,9 @@ def stagingToCbio(
sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][
0
]
clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope"
][0]
# Grab assay information
assay_info_ind = databaseSynIdMappingDf["Database"] == "assayinfo"
assay_info_synid = databaseSynIdMappingDf["Id"][assay_info_ind][0]
Expand Down Expand Up @@ -1592,7 +1602,8 @@ def stagingToCbio(
# Clinical release scope filter
# If private -> Don't release to public
clinicalReleaseScopeDf = extract.get_syntabledf(
syn, "SELECT * FROM syn8545211 where releaseScope <> 'private'"
syn,
f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope <> 'private'",
)

patientCols = clinicalReleaseScopeDf["fieldName"][
Expand Down Expand Up @@ -1655,6 +1666,7 @@ def stagingToCbio(
consortiumReleaseSynId,
current_release_staging,
CENTER_MAPPING_DF,
databaseSynIdMappingDf,
used=[sample_used, patient_used],
)

Expand Down Expand Up @@ -1884,7 +1896,6 @@ def create_link_version(
]
if clinical_ent:
# Set private permission for the data_clinical.txt link
syn.setPermissions(clinical_ent[0], principalId=3346558, accessType=[])
syn.setPermissions(clinical_ent[0], principalId=3326313, accessType=[])

for ents in case_list_entities:
Expand Down
23 changes: 18 additions & 5 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,16 @@ def remap_clinical_values(
sampletype_mapping.index = sampletype_mapping["CODE"]
sampletype_dict = sampletype_mapping.to_dict()

if clinicaldf.get("SAMPLE_TYPE") is not None:
clinicaldf["SAMPLE_TYPE_DETAILED"] = clinicaldf["SAMPLE_TYPE"]
for column in [
"PRIMARY_RACE",
"SECONDARY_RACE",
"TERTIARY_RACE",
"SEX",
"ETHNICITY",
"SAMPLE_TYPE",
]:
if column in clinicaldf.columns:
clinicaldf[f"{column}_DETAILED"] = clinicaldf[column]

# Use pandas mapping feature
clinicaldf = clinicaldf.replace(
Expand All @@ -316,9 +324,14 @@ def remap_clinical_values(
"SECONDARY_RACE": race_dict["CBIO_LABEL"],
"TERTIARY_RACE": race_dict["CBIO_LABEL"],
"SAMPLE_TYPE": sampletype_dict["CBIO_LABEL"],
"SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"],
"SEX": sex_dict["CBIO_LABEL"],
"ETHNICITY": ethnicity_dict["CBIO_LABEL"],
"PRIMARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"SECONDARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"TERTIARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"],
"SEX_DETAILED": sex_dict["DESCRIPTION"],
"ETHNICITY_DETAILED": ethnicity_dict["DESCRIPTION"],
}
)

Expand Down Expand Up @@ -481,12 +494,12 @@ def preprocess(self, newpath):
# The clinical tier release scope table id is read from the GENIE config
# (key: clinical_tier_release_scope) instead of being hard-coded here
patient_cols_table = self.syn.tableQuery(
"select fieldName from syn8545211 where "
f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where "
"patient is True and inClinicalDb is True"
)
patient_cols = patient_cols_table.asDataFrame()["fieldName"].tolist()
sample_cols_table = self.syn.tableQuery(
"select fieldName from syn8545211 where "
f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where "
"sample is True and inClinicalDb is True"
)
sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist()
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def genie_config():
"race_mapping": "syn7434236",
"sex_mapping": "syn7434222",
"sampletype_mapping": "syn7434273",
"clinical_tier_release_scope": "syn8545211",
}
return config

Expand Down
100 changes: 95 additions & 5 deletions tests/test_clinical.py
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import synapseclient
from genie import process_functions, validate
from genie_registry.clinical import Clinical
import pdb


def createMockTable(dataframe):
Expand Down Expand Up @@ -37,11 +38,32 @@ def table_query_results(*args):
)
)

patientdf = pd.DataFrame(
dict(
fieldName=["PATIENT_ID", "SEX", "PRIMARY_RACE"],
patient=[True, True, True],
sample=[True, False, False],
)
)
sampledf = pd.DataFrame(
dict(
fieldName=["PATIENT_ID", "SAMPLE_ID"],
patient=[True, False],
sample=[True, True],
)
)

table_query_results_map = {
("select * from syn7434222",): createMockTable(sexdf),
("select * from syn7434236",): createMockTable(no_nan),
("select * from syn7434242",): createMockTable(no_nan),
("select * from syn7434273",): createMockTable(no_nan),
(
"select fieldName from syn8545211 where patient is True and inClinicalDb is True",
): createMockTable(patientdf),
(
"select fieldName from syn8545211 where sample is True and inClinicalDb is True",
): createMockTable(sampledf),
}

json_oncotreeurl = (
Expand Down Expand Up @@ -952,16 +974,61 @@ def test_remap_clinical_values_sampletype():


@pytest.mark.parametrize(
"col", ["SEX", "PRIMARY_RACE", "SECONDARY_RACE", "TERTIARY_RACE", "ETHNICITY"]
("testdf", "expecteddf"),
[
(
pd.DataFrame(
{
"SEX": [1, 2, 99],
"PRIMARY_RACE": [1, 2, 99],
"SECONDARY_RACE": [1, 2, 99],
"TERTIARY_RACE": [1, 2, 99],
"ETHNICITY": [1, 2, 99],
}
),
pd.DataFrame(
{
"SEX": ["Male", "Female", "Unknown"],
"PRIMARY_RACE": ["Male", "Female", "Unknown"],
"SECONDARY_RACE": ["Male", "Female", "Unknown"],
"TERTIARY_RACE": ["Male", "Female", "Unknown"],
"ETHNICITY": ["Male", "Female", "Unknown"],
"ETHNICITY_DETAILED": ["Male", "Female", "Not coded"],
"PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SECONDARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SEX_DETAILED": ["Male", "Female", "Not coded"],
"TERTIARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
}
),
),
(
pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]}),
pd.DataFrame(
{
"SEX": ["Male", "Female", "Unknown"],
"PRIMARY_RACE": ["Male", "Female", "Unknown"],
"PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SEX_DETAILED": ["Male", "Female", "Not coded"],
}
),
),
(
pd.DataFrame({"CENTER": [1, 2, 99]}),
pd.DataFrame(
{
"CENTER": [1, 2, 99],
}
),
),
],
ids=["all_detailed_columns", "some_detailed_columns", "no_detailed_columns"],
)
def test_remap_clinical_values(col):
def test_remap_clinical_values(testdf, expecteddf):
"""Test Remapping clinical values"""
testdf = pd.DataFrame({col: [1, 2, 99]})
expecteddf = pd.DataFrame({col: ["Male", "Female", "Unknown"]})
remappeddf = genie_registry.clinical.remap_clinical_values(
testdf, sexdf, sexdf, sexdf, sexdf
)
assert expecteddf.equals(remappeddf)
assert expecteddf.sort_index(axis=1).equals(remappeddf.sort_index(axis=1))


def test__check_int_year_consistency_valid():
Expand Down Expand Up @@ -1555,3 +1622,26 @@ def test_that__cross_validate_assay_info_has_seq_returns_expected_msg_if_valid(
)
assert warnings == expected_warning
assert errors == expected_error


def test_preprocess(clin_class, newpath=None):
    """Check the dictionary returned by ``preprocess``.

    The clinical template is compared without regard to column order (both
    frames are sorted by column label first); the remaining entries are
    compared for plain equality.
    """
    template_columns = ["PATIENT_ID", "SEX", "PRIMARY_RACE", "SAMPLE_ID"]
    wanted_template = pd.DataFrame(columns=template_columns)
    wanted_values = {
        "sample": True,
        "patient": True,
        "patientCols": ["PATIENT_ID", "SEX", "PRIMARY_RACE"],
        "sampleCols": ["PATIENT_ID", "SAMPLE_ID"],
    }

    outcome = clin_class.preprocess(newpath)

    # Column order of the template is not part of the contract being tested.
    actual_sorted = outcome["clinicalTemplate"].sort_index(axis=1)
    wanted_sorted = wanted_template.sort_index(axis=1)
    assert actual_sorted.equals(wanted_sorted)

    # Same checks, in the same order, as one assertion per key.
    for key in ("sample", "patient", "patientCols", "sampleCols"):
        assert outcome[key] == wanted_values[key]
Loading