From f1f8f92c4691fe7adfc5dfa5d2fa420484d35d91 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 17 May 2024 17:01:58 +0000 Subject: [PATCH 1/9] export detailed columns for NAACCR codes --- genie_registry/clinical.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index 3468ba7b..bb08f5dd 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -306,8 +306,16 @@ def remap_clinical_values( sampletype_mapping.index = sampletype_mapping["CODE"] sampletype_dict = sampletype_mapping.to_dict() - if clinicaldf.get("SAMPLE_TYPE") is not None: - clinicaldf["SAMPLE_TYPE_DETAILED"] = clinicaldf["SAMPLE_TYPE"] + for column in [ + "PRIMARY_RACE", + "SECONDARY_RACE", + "TERTIARY_RACE", + "SEX", + "ETHNICITY", + "SAMPLE_TYPE", + ]: + if column in clinicaldf.columns: + clinicaldf[f"{column}_DETAILED"] = clinicaldf[column] # Use pandas mapping feature clinicaldf = clinicaldf.replace( @@ -316,9 +324,14 @@ def remap_clinical_values( "SECONDARY_RACE": race_dict["CBIO_LABEL"], "TERTIARY_RACE": race_dict["CBIO_LABEL"], "SAMPLE_TYPE": sampletype_dict["CBIO_LABEL"], - "SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"], "SEX": sex_dict["CBIO_LABEL"], "ETHNICITY": ethnicity_dict["CBIO_LABEL"], + "PRIMARY_RACE_DETAILED": race_dict["DESCRIPTION"], + "SECONDARY_RACE_DETAILED": race_dict["DESCRIPTION"], + "TERTIARY_RACE_DETAILED": race_dict["DESCRIPTION"], + "SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"], + "SEX_DETAILED": sex_dict["DESCRIPTION"], + "ETHNICITY_DETAILED": ethnicity_dict["DESCRIPTION"], } ) @@ -481,12 +494,12 @@ def preprocess(self, newpath): # hardcoded because it never changes # TODO: Add clinical tier release scope to GENIE config patient_cols_table = self.syn.tableQuery( - "select fieldName from syn8545211 where " + f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where " "patient is True and inClinicalDb is True" ) patient_cols = patient_cols_table.asDataFrame()["fieldName"].tolist() sample_cols_table = self.syn.tableQuery( - "select fieldName from syn8545211 where " + f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where " "sample is True and inClinicalDb is True" ) sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist() From f6eb440609a65b809e2de43479cad8527f1f9c0c Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 17 May 2024 23:40:52 +0000 Subject: [PATCH 2/9] replace hard code dataframe synid with genie_config --- genie/database_to_staging.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index c7e1b7c3..86347fdc 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -1013,6 +1013,7 @@ def store_clinical_files( release_synid, current_release_staging, center_mappingdf, + databaseSynIdMappingDf, used=None, ): """ @@ -1030,6 +1031,7 @@ def store_clinical_files( release_synid: Synapse id to store release file current_release_staging: Staging flag center_mappingdf: Center mapping dataframe + databaseSynIdMappingDf: Database to Synapse Id mapping Returns: pandas.DataFrame: configured clinical dataframe @@ -1154,7 +1156,12 @@ def store_clinical_files( keep_merged_consortium_samples = clinicaldf.SAMPLE_ID # This mapping table is the GENIE clinical code to description # mapping to generate the headers of the clinical file - mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600") + clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map" + ][0] + mapping = extract.get_syntabledf( + syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}" + ) clinical_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical.txt") clinical_sample_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_sample.txt") clinical_patient_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_patient.txt") @@ -1564,6 +1571,9 @@ def stagingToCbio( sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][ 0 ] + clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope" + ][0] # Grab assay information assay_info_ind = databaseSynIdMappingDf["Database"] == "assayinfo" assay_info_synid = databaseSynIdMappingDf["Id"][assay_info_ind][0] @@ -1592,7 +1602,8 @@ def stagingToCbio( # Clinical release scope filter # If private -> Don't release to public clinicalReleaseScopeDf = extract.get_syntabledf( - syn, "SELECT * FROM syn8545211 where releaseScope <> 'private'" + syn, + f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope <> 'private'", ) patientCols = clinicalReleaseScopeDf["fieldName"][ From 3cac57ec11c713460ec5dff7629ae66dee00bb2d Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 20 May 2024 18:32:33 +0000 Subject: [PATCH 3/9] replace hard code table synid --- genie/database_to_staging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 86347fdc..d4ca65e2 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -1666,6 +1666,7 @@ def stagingToCbio( consortiumReleaseSynId, current_release_staging, CENTER_MAPPING_DF, + databaseSynIdMappingDf, used=[sample_used, patient_used], ) From 91ef4696a545dfd5101949dd1116a43b7942cab1 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 20 May 2024 19:48:37 +0000 Subject: [PATCH 4/9] update unit test --- tests/test_clinical.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 387023bd..d5fa73a6 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -956,8 +956,15 @@ def test_remap_clinical_values_sampletype(): ) def test_remap_clinical_values(col): """Test Remapping clinical values""" - testdf = pd.DataFrame({col: [1, 2, 99]}) - expecteddf = pd.DataFrame({col: ["Male", "Female", "Unknown"]}) + testdf = pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]}) + expecteddf = pd.DataFrame( + { + "SEX": ["Male", "Female", "Unknown"], + "PRIMARY_RACE": ["Male", "Female", "Unknown"], + "PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SEX_DETAILED": ["Male", "Female", "Not coded"], + } + ) remappeddf = genie_registry.clinical.remap_clinical_values( testdf, sexdf, sexdf, sexdf, sexdf ) From c93116eef2a58795c186bae0c9cc01ea91f21495 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 23 May 2024 23:43:31 +0000 Subject: [PATCH 5/9] remove Project GENIE data analysis group since it no longer exists --- genie/database_to_staging.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index d4ca65e2..c52a23f2 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -1896,7 +1896,6 @@ def create_link_version( ] if clinical_ent: # Set private permission for the data_clinical.txt link - syn.setPermissions(clinical_ent[0], principalId=3346558, accessType=[]) syn.setPermissions(clinical_ent[0], principalId=3326313, accessType=[]) for ents in case_list_entities: From 39fcdbdd36bcc6a67c69a5b78d602157eb4e40b4 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 24 May 2024 01:43:15 +0000 Subject: [PATCH 6/9] replace hard code in public release code --- genie/consortium_to_public.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py index 1e307a1d..2b92849a 100644 --- a/genie/consortium_to_public.py +++ b/genie/consortium_to_public.py @@ -139,10 +139,11 @@ def consortiumToPublic( ) # Clinical release scope filter - # If consortium -> Don't release to public - # TODO: check why this synapse id is hard coded? + clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope" + ][0] publicRelease = extract.get_syntabledf( - syn=syn, query_string="SELECT * FROM syn8545211 where releaseScope = 'public'" + syn=syn, query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'" ) allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)] @@ -186,7 +187,10 @@ def consortiumToPublic( ) # Grab mapping table to fill in clinical headers - mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600") + clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map" + ][0] + mapping = extract.get_syntabledf(syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}") genePanelEntities = [] for entName, entId in consortiumRelease[2]: is_deprecated_file = entName in ["data_fusions.txt"] From a4578ee00b0f5195c2590efbf8f9d4ced84d3236 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 24 May 2024 01:46:58 +0000 Subject: [PATCH 7/9] reformat genie/consortium_to_public.py --- genie/consortium_to_public.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py index 2b92849a..eead602c 100644 --- a/genie/consortium_to_public.py +++ b/genie/consortium_to_public.py @@ -143,7 +143,8 @@ def consortiumToPublic( databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope" ][0] publicRelease = extract.get_syntabledf( - syn=syn, query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'" + syn=syn, + query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'", ) allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)] @@ -190,7 +191,9 @@ def consortiumToPublic( clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][ databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map" ][0] - mapping = extract.get_syntabledf(syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}") + mapping = extract.get_syntabledf( + syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}" + ) genePanelEntities = [] for entName, entId in consortiumRelease[2]: is_deprecated_file = entName in ["data_fusions.txt"] From d1b472d29adfcb65e6e85e04d3788ad294972c1a Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 28 May 2024 21:55:37 +0000 Subject: [PATCH 8/9] add clinical_tier_release_scope table synid to genie_config --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index 90c47769..6879b915 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,6 +77,7 @@ def genie_config(): "race_mapping": "syn7434236", "sex_mapping": "syn7434222", "sampletype_mapping": "syn7434273", + "clinical_tier_release_scope": "syn8545211", } return config From bc6b6848e3b4c5c40e314c3724c67b41254bc3a7 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 28 May 2024 21:55:54 +0000 Subject: [PATCH 9/9] add more tests to the unit test --- tests/test_clinical.py | 107 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 12 deletions(-) diff --git a/tests/test_clinical.py b/tests/test_clinical.py index d5fa73a6..75248e41 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -9,6 +9,7 @@ import synapseclient from genie import process_functions, validate from genie_registry.clinical import Clinical +import pdb def createMockTable(dataframe): @@ -37,11 +38,32 @@ def table_query_results(*args): ) ) +patientdf = pd.DataFrame( + dict( + fieldName=["PATIENT_ID", "SEX", "PRIMARY_RACE"], + patient=[True, True, True], + sample=[True, False, False], + ) +) +sampledf = pd.DataFrame( + dict( + fieldName=["PATIENT_ID", "SAMPLE_ID"], + patient=[True, False], + sample=[True, True], + ) +) + table_query_results_map = { ("select * from syn7434222",): createMockTable(sexdf), ("select * from syn7434236",): createMockTable(no_nan), ("select * from syn7434242",): createMockTable(no_nan), ("select * from syn7434273",): createMockTable(no_nan), + ( + "select fieldName from syn8545211 where patient is True and inClinicalDb is True", + ): createMockTable(patientdf), + ( + "select fieldName from syn8545211 where sample is True and inClinicalDb is True", + ): createMockTable(sampledf), } json_oncotreeurl = ( @@ -952,23 +974,61 @@ def test_remap_clinical_values_sampletype(): @pytest.mark.parametrize( - "col", ["SEX", "PRIMARY_RACE", "SECONDARY_RACE", "TERTIARY_RACE", "ETHNICITY"] + ("testdf", "expecteddf"), + [ + ( + pd.DataFrame( + { + "SEX": [1, 2, 99], + "PRIMARY_RACE": [1, 2, 99], + "SECONDARY_RACE": [1, 2, 99], + "TERTIARY_RACE": [1, 2, 99], + "ETHNICITY": [1, 2, 99], + } + ), + pd.DataFrame( + { + "SEX": ["Male", "Female", "Unknown"], + "PRIMARY_RACE": ["Male", "Female", "Unknown"], + "SECONDARY_RACE": ["Male", "Female", "Unknown"], + "TERTIARY_RACE": ["Male", "Female", "Unknown"], + "ETHNICITY": ["Male", "Female", "Unknown"], + "ETHNICITY_DETAILED": ["Male", "Female", "Not coded"], + "PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SECONDARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SEX_DETAILED": ["Male", "Female", "Not coded"], + "TERTIARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + } + ), + ), + ( + pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]}), + pd.DataFrame( + { + "SEX": ["Male", "Female", "Unknown"], + "PRIMARY_RACE": ["Male", "Female", "Unknown"], + "PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SEX_DETAILED": ["Male", "Female", "Not coded"], + } + ), + ), + ( + pd.DataFrame({"CENTER": [1, 2, 99]}), + pd.DataFrame( + { + "CENTER": [1, 2, 99], + } + ), + ), + ], + ids=["all_detailed_columns", "some_detailed_columns", "no_detailed_columns"], ) -def test_remap_clinical_values(col): +def test_remap_clinical_values(testdf, expecteddf): """Test Remapping clinical values""" - testdf = pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]}) - expecteddf = pd.DataFrame( - { - "SEX": ["Male", "Female", "Unknown"], - "PRIMARY_RACE": ["Male", "Female", "Unknown"], - "PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"], - "SEX_DETAILED": ["Male", "Female", "Not coded"], - } - ) remappeddf = genie_registry.clinical.remap_clinical_values( testdf, sexdf, sexdf, sexdf, sexdf ) - assert expecteddf.equals(remappeddf) + assert expecteddf.sort_index(axis=1).equals(remappeddf.sort_index(axis=1)) def test__check_int_year_consistency_valid(): @@ -1562,3 +1622,26 @@ def test_that__cross_validate_assay_info_has_seq_returns_expected_msg_if_valid( ) assert warnings == expected_warning assert errors == expected_error + + +def test_preprocess(clin_class, newpath=None): + """Test preprocess function""" + expected = { + "clinicalTemplate": pd.DataFrame( + columns=["PATIENT_ID", "SEX", "PRIMARY_RACE", "SAMPLE_ID"] + ), + "sample": True, + "patient": True, + "patientCols": ["PATIENT_ID", "SEX", "PRIMARY_RACE"], + "sampleCols": ["PATIENT_ID", "SAMPLE_ID"], + } + results = clin_class.preprocess(newpath) + assert ( + results["clinicalTemplate"] + .sort_index(axis=1) + .equals(expected["clinicalTemplate"].sort_index(axis=1)) + ) + assert results["sample"] == expected["sample"] + assert results["patient"] == expected["patient"] + assert results["patientCols"] == expected["patientCols"] + assert results["sampleCols"] == expected["sampleCols"]