diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py index 1e307a1d..eead602c 100644 --- a/genie/consortium_to_public.py +++ b/genie/consortium_to_public.py @@ -139,10 +139,12 @@ def consortiumToPublic( ) # Clinical release scope filter - # If consortium -> Don't release to public - # TODO: check why this synapse id is hard coded? + clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope" + ][0] publicRelease = extract.get_syntabledf( - syn=syn, query_string="SELECT * FROM syn8545211 where releaseScope = 'public'" + syn=syn, + query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'", ) allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)] @@ -186,7 +188,12 @@ def consortiumToPublic( ) # Grab mapping table to fill in clinical headers - mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600") + clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map" + ][0] + mapping = extract.get_syntabledf( + syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}" + ) genePanelEntities = [] for entName, entId in consortiumRelease[2]: is_deprecated_file = entName in ["data_fusions.txt"] diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index c7e1b7c3..c52a23f2 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -1013,6 +1013,7 @@ def store_clinical_files( release_synid, current_release_staging, center_mappingdf, + databaseSynIdMappingDf, used=None, ): """ @@ -1030,6 +1031,7 @@ def store_clinical_files( release_synid: Synapse id to store release file current_release_staging: Staging flag center_mappingdf: Center mapping dataframe + databaseSynIdMappingDf: Database to Synapse Id mapping Returns: pandas.DataFrame: configured clinical dataframe @@ -1154,7 +1156,12 @@ def store_clinical_files( keep_merged_consortium_samples = clinicaldf.SAMPLE_ID # This mapping table is the GENIE clinical code to description # mapping to generate the headers of the clinical file - mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600") + clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map" + ][0] + mapping = extract.get_syntabledf( + syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}" + ) clinical_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical.txt") clinical_sample_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_sample.txt") clinical_patient_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_patient.txt") @@ -1564,6 +1571,9 @@ def stagingToCbio( sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][ 0 ] + clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][ + databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope" + ][0] # Grab assay information assay_info_ind = databaseSynIdMappingDf["Database"] == "assayinfo" assay_info_synid = databaseSynIdMappingDf["Id"][assay_info_ind][0] @@ -1592,7 +1602,8 @@ def stagingToCbio( # Clinical release scope filter # If private -> Don't release to public clinicalReleaseScopeDf = extract.get_syntabledf( - syn, "SELECT * FROM syn8545211 where releaseScope <> 'private'" + syn, + f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope <> 'private'", ) patientCols = clinicalReleaseScopeDf["fieldName"][ @@ -1655,6 +1666,7 @@ def stagingToCbio( consortiumReleaseSynId, current_release_staging, CENTER_MAPPING_DF, + databaseSynIdMappingDf, used=[sample_used, patient_used], ) @@ -1884,7 +1896,6 @@ def create_link_version( ] if clinical_ent: # Set private permission for the data_clinical.txt link - syn.setPermissions(clinical_ent[0], principalId=3346558, accessType=[]) syn.setPermissions(clinical_ent[0], principalId=3326313, accessType=[]) for ents in case_list_entities: diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index 3468ba7b..bb08f5dd 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -306,8 +306,16 @@ def remap_clinical_values( sampletype_mapping.index = sampletype_mapping["CODE"] sampletype_dict = sampletype_mapping.to_dict() - if clinicaldf.get("SAMPLE_TYPE") is not None: - clinicaldf["SAMPLE_TYPE_DETAILED"] = clinicaldf["SAMPLE_TYPE"] + for column in [ + "PRIMARY_RACE", + "SECONDARY_RACE", + "TERTIARY_RACE", + "SEX", + "ETHNICITY", + "SAMPLE_TYPE", + ]: + if column in clinicaldf.columns: + clinicaldf[f"{column}_DETAILED"] = clinicaldf[column] # Use pandas mapping feature clinicaldf = clinicaldf.replace( @@ -316,9 +324,14 @@ def remap_clinical_values( "SECONDARY_RACE": race_dict["CBIO_LABEL"], "TERTIARY_RACE": race_dict["CBIO_LABEL"], "SAMPLE_TYPE": sampletype_dict["CBIO_LABEL"], - "SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"], "SEX": sex_dict["CBIO_LABEL"], "ETHNICITY": ethnicity_dict["CBIO_LABEL"], + "PRIMARY_RACE_DETAILED": race_dict["DESCRIPTION"], + "SECONDARY_RACE_DETAILED": race_dict["DESCRIPTION"], + "TERTIARY_RACE_DETAILED": race_dict["DESCRIPTION"], + "SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"], + "SEX_DETAILED": sex_dict["DESCRIPTION"], + "ETHNICITY_DETAILED": ethnicity_dict["DESCRIPTION"], } ) @@ -481,12 +494,12 @@ def preprocess(self, newpath): # hardcoded because it never changes # TODO: Add clinical tier release scope to GENIE config patient_cols_table = self.syn.tableQuery( - "select fieldName from syn8545211 where " + f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where " "patient is True and inClinicalDb is True" ) patient_cols = patient_cols_table.asDataFrame()["fieldName"].tolist() sample_cols_table = self.syn.tableQuery( - "select fieldName from syn8545211 where " + f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where " "sample is True and inClinicalDb is True" ) sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist() diff --git a/tests/conftest.py b/tests/conftest.py index 90c47769..6879b915 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,6 +77,7 @@ def genie_config(): "race_mapping": "syn7434236", "sex_mapping": "syn7434222", "sampletype_mapping": "syn7434273", + "clinical_tier_release_scope": "syn8545211", } return config diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 387023bd..75248e41 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -9,6 +9,7 @@ import synapseclient from genie import process_functions, validate from genie_registry.clinical import Clinical +import pdb def createMockTable(dataframe): @@ -37,11 +38,32 @@ def table_query_results(*args): ) ) +patientdf = pd.DataFrame( + dict( + fieldName=["PATIENT_ID", "SEX", "PRIMARY_RACE"], + patient=[True, True, True], + sample=[True, False, False], + ) +) +sampledf = pd.DataFrame( + dict( + fieldName=["PATIENT_ID", "SAMPLE_ID"], + patient=[True, False], + sample=[True, True], + ) +) + table_query_results_map = { ("select * from syn7434222",): createMockTable(sexdf), ("select * from syn7434236",): createMockTable(no_nan), ("select * from syn7434242",): createMockTable(no_nan), ("select * from syn7434273",): createMockTable(no_nan), + ( + "select fieldName from syn8545211 where patient is True and inClinicalDb is True", + ): createMockTable(patientdf), + ( + "select fieldName from syn8545211 where sample is True and inClinicalDb is True", + ): createMockTable(sampledf), } json_oncotreeurl = ( @@ -952,16 +974,61 @@ def test_remap_clinical_values_sampletype(): @pytest.mark.parametrize( - "col", ["SEX", "PRIMARY_RACE", "SECONDARY_RACE", "TERTIARY_RACE", "ETHNICITY"] + ("testdf", "expecteddf"), + [ + ( + pd.DataFrame( + { + "SEX": [1, 2, 99], + "PRIMARY_RACE": [1, 2, 99], + "SECONDARY_RACE": [1, 2, 99], + "TERTIARY_RACE": [1, 2, 99], + "ETHNICITY": [1, 2, 99], + } + ), + pd.DataFrame( + { + "SEX": ["Male", "Female", "Unknown"], + "PRIMARY_RACE": ["Male", "Female", "Unknown"], + "SECONDARY_RACE": ["Male", "Female", "Unknown"], + "TERTIARY_RACE": ["Male", "Female", "Unknown"], + "ETHNICITY": ["Male", "Female", "Unknown"], + "ETHNICITY_DETAILED": ["Male", "Female", "Not coded"], + "PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SECONDARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SEX_DETAILED": ["Male", "Female", "Not coded"], + "TERTIARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + } + ), + ), + ( + pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]}), + pd.DataFrame( + { + "SEX": ["Male", "Female", "Unknown"], + "PRIMARY_RACE": ["Male", "Female", "Unknown"], + "PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"], + "SEX_DETAILED": ["Male", "Female", "Not coded"], + } + ), + ), + ( + pd.DataFrame({"CENTER": [1, 2, 99]}), + pd.DataFrame( + { + "CENTER": [1, 2, 99], + } + ), + ), + ], + ids=["all_detailed_columns", "some_detailed_columns", "no_detailed_columns"], ) -def test_remap_clinical_values(col): +def test_remap_clinical_values(testdf, expecteddf): """Test Remapping clinical values""" - testdf = pd.DataFrame({col: [1, 2, 99]}) - expecteddf = pd.DataFrame({col: ["Male", "Female", "Unknown"]}) remappeddf = genie_registry.clinical.remap_clinical_values( testdf, sexdf, sexdf, sexdf, sexdf ) - assert expecteddf.equals(remappeddf) + assert expecteddf.sort_index(axis=1).equals(remappeddf.sort_index(axis=1)) def test__check_int_year_consistency_valid(): @@ -1555,3 +1622,26 @@ def test_that__cross_validate_assay_info_has_seq_returns_expected_msg_if_valid( ) assert warnings == expected_warning assert errors == expected_error + + +def test_preprocess(clin_class, newpath=None): + """Test preprocess function""" + expected = { + "clinicalTemplate": pd.DataFrame( + columns=["PATIENT_ID", "SEX", "PRIMARY_RACE", "SAMPLE_ID"] + ), + "sample": True, + "patient": True, + "patientCols": ["PATIENT_ID", "SEX", "PRIMARY_RACE"], + "sampleCols": ["PATIENT_ID", "SAMPLE_ID"], + } + results = clin_class.preprocess(newpath) + assert ( + results["clinicalTemplate"] + .sort_index(axis=1) + .equals(expected["clinicalTemplate"].sort_index(axis=1)) + ) + assert results["sample"] == expected["sample"] + assert results["patient"] == expected["patient"] + assert results["patientCols"] == expected["patientCols"] + assert results["sampleCols"] == expected["sampleCols"]