Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-1313] Export detailed columns for NAACCR codes #567

Merged
merged 10 commits into from
May 28, 2024
15 changes: 11 additions & 4 deletions genie/consortium_to_public.py
Copy link
Contributor

@rxu17 rxu17 May 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a comment; nothing actionable here. I think it's OK that we don't have unit tests for these giant functions (stagingToCbio and store_clinical_files in database_to_staging, and consortiumToPublic in consortium_to_public), because they are better covered by integration tests, which we have already done through our pipeline comparisons and our runs on the test pipeline.

Doing this would be outside the scope of this ticket and is already part of our tech debt epic to refactor and add tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree that we should add unit tests for these functions later, as part of our tech debt epic.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The more we can spin out "unit functions" from within these larger functions, the better the larger functions themselves will be served by integration tests.

If we think of these as ETL, it should tell the story of data processing.

Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,12 @@ def consortiumToPublic(
)

# Clinical release scope filter
# If consortium -> Don't release to public
# TODO: check why this synapse id is hard coded?
clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope"
][0]
publicRelease = extract.get_syntabledf(
syn=syn, query_string="SELECT * FROM syn8545211 where releaseScope = 'public'"
syn=syn,
query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
)

allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
Expand Down Expand Up @@ -186,7 +188,12 @@ def consortiumToPublic(
)

# Grab mapping table to fill in clinical headers
mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600")
clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map"
][0]
mapping = extract.get_syntabledf(
syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}"
)
genePanelEntities = []
for entName, entId in consortiumRelease[2]:
is_deprecated_file = entName in ["data_fusions.txt"]
Expand Down
17 changes: 14 additions & 3 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,7 @@ def store_clinical_files(
release_synid,
current_release_staging,
center_mappingdf,
databaseSynIdMappingDf,
used=None,
):
"""
Expand All @@ -1030,6 +1031,7 @@ def store_clinical_files(
release_synid: Synapse id to store release file
current_release_staging: Staging flag
center_mappingdf: Center mapping dataframe
databaseSynIdMappingDf: Database to Synapse Id mapping

Returns:
pandas.DataFrame: configured clinical dataframe
Expand Down Expand Up @@ -1154,7 +1156,12 @@ def store_clinical_files(
keep_merged_consortium_samples = clinicaldf.SAMPLE_ID
# This mapping table is the GENIE clinical code to description
# mapping to generate the headers of the clinical file
mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600")
clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map"
][0]
mapping = extract.get_syntabledf(
syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}"
)
clinical_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical.txt")
clinical_sample_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_sample.txt")
clinical_patient_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_patient.txt")
Expand Down Expand Up @@ -1564,6 +1571,9 @@ def stagingToCbio(
sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][
0
]
clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope"
][0]
# Grab assay information
assay_info_ind = databaseSynIdMappingDf["Database"] == "assayinfo"
assay_info_synid = databaseSynIdMappingDf["Id"][assay_info_ind][0]
Expand Down Expand Up @@ -1592,7 +1602,8 @@ def stagingToCbio(
# Clinical release scope filter
# If private -> Don't release to public
clinicalReleaseScopeDf = extract.get_syntabledf(
syn, "SELECT * FROM syn8545211 where releaseScope <> 'private'"
syn,
f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope <> 'private'",
)

patientCols = clinicalReleaseScopeDf["fieldName"][
Expand Down Expand Up @@ -1655,6 +1666,7 @@ def stagingToCbio(
consortiumReleaseSynId,
current_release_staging,
CENTER_MAPPING_DF,
databaseSynIdMappingDf,
used=[sample_used, patient_used],
)

Expand Down Expand Up @@ -1884,7 +1896,6 @@ def create_link_version(
]
if clinical_ent:
# Set private permission for the data_clinical.txt link
syn.setPermissions(clinical_ent[0], principalId=3346558, accessType=[])
syn.setPermissions(clinical_ent[0], principalId=3326313, accessType=[])

for ents in case_list_entities:
Expand Down
23 changes: 18 additions & 5 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,16 @@ def remap_clinical_values(
sampletype_mapping.index = sampletype_mapping["CODE"]
sampletype_dict = sampletype_mapping.to_dict()

if clinicaldf.get("SAMPLE_TYPE") is not None:
clinicaldf["SAMPLE_TYPE_DETAILED"] = clinicaldf["SAMPLE_TYPE"]
for column in [
"PRIMARY_RACE",
"SECONDARY_RACE",
"TERTIARY_RACE",
"SEX",
"ETHNICITY",
"SAMPLE_TYPE",
]:
if column in clinicaldf.columns:
clinicaldf[f"{column}_DETAILED"] = clinicaldf[column]

# Use pandas mapping feature
clinicaldf = clinicaldf.replace(
Expand All @@ -316,9 +324,14 @@ def remap_clinical_values(
"SECONDARY_RACE": race_dict["CBIO_LABEL"],
"TERTIARY_RACE": race_dict["CBIO_LABEL"],
"SAMPLE_TYPE": sampletype_dict["CBIO_LABEL"],
"SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"],
"SEX": sex_dict["CBIO_LABEL"],
"ETHNICITY": ethnicity_dict["CBIO_LABEL"],
"PRIMARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"SECONDARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"TERTIARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"],
"SEX_DETAILED": sex_dict["DESCRIPTION"],
"ETHNICITY_DETAILED": ethnicity_dict["DESCRIPTION"],
}
)

Expand Down Expand Up @@ -481,12 +494,12 @@ def preprocess(self, newpath):
# hardcoded because it never changes
# TODO: Add clinical tier release scope to GENIE config
patient_cols_table = self.syn.tableQuery(
"select fieldName from syn8545211 where "
f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where "
"patient is True and inClinicalDb is True"
)
patient_cols = patient_cols_table.asDataFrame()["fieldName"].tolist()
sample_cols_table = self.syn.tableQuery(
"select fieldName from syn8545211 where "
f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where "
"sample is True and inClinicalDb is True"
)
sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist()
Expand Down
11 changes: 9 additions & 2 deletions tests/test_clinical.py
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -956,8 +956,15 @@ def test_remap_clinical_values_sampletype():
)
def test_remap_clinical_values(col):
"""Test Remapping clinical values"""
testdf = pd.DataFrame({col: [1, 2, 99]})
expecteddf = pd.DataFrame({col: ["Male", "Female", "Unknown"]})
testdf = pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]})
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
expecteddf = pd.DataFrame(
{
"SEX": ["Male", "Female", "Unknown"],
"PRIMARY_RACE": ["Male", "Female", "Unknown"],
"PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SEX_DETAILED": ["Male", "Female", "Not coded"],
}
)
remappeddf = genie_registry.clinical.remap_clinical_values(
testdf, sexdf, sexdf, sexdf, sexdf
)
Expand Down
Loading