From ab7e882a00e9c44cf2f5c51c6e7dd09e50afec22 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 15 Nov 2023 17:24:04 -0800 Subject: [PATCH 1/4] fix bed/clinical file validation --- genie_registry/clinical.py | 34 +++++++--- tests/test_clinical.py | 132 ++++++++++++++++++++++++------------- 2 files changed, 112 insertions(+), 54 deletions(-) diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index 014b0b55..b8fba704 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -1012,28 +1012,43 @@ def _get_dataframe(self, filePathList): return clinicaldf - def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple: + def _cross_validate_bed_files_exist(self, clinicaldf) -> list: """Check that a bed file exist per SEQ_ASSAY_ID value in clinical file""" - errors = "" - warnings = "" missing_files = [] - seq_assay_ids = clinicaldf["SEQ_ASSAY_ID"].unique().tolist() + exception_params = {"ignore_case": True, "allow_underscore": True} + + # standardize and get unique seq assay ids before searching bed files + seq_assay_ids = set([ + validate.standardize_string_for_validation(sq_id, **exception_params) + for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique() + ]) for seq_assay_id in seq_assay_ids: bed_files = validate.parse_file_info_in_nested_list( nested_list=self.ancillary_files, search_str=f"{seq_assay_id}.bed", # type: ignore[arg-type] - ignore_case=True, - allow_underscore=True, + **exception_params, ) if not bed_files["files"]: missing_files.append(f"{seq_assay_id}.bed") + return missing_files + + def _cross_validate_bed_files_exist_message(self, missing_bed_files : list) -> tuple: + """Gets the warning/error messages given the missing bed files list + + Args: + missing_bed_files (list): list of missing bed files - if missing_files: + Returns: + tuple: error + warning + """ + errors = "" + warnings = "" + if missing_bed_files: errors = ( "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " "Please update your file(s) to be consistent.\n" - f"Missing BED files: {', '.join(missing_files)}\n" + f"Missing BED files: {', '.join(missing_bed_files)}\n" ) return errors, warnings @@ -1087,7 +1102,8 @@ def _cross_validate(self, clinicaldf) -> tuple: errors_assay, warnings_assay = self._cross_validate_assay_info_has_seq( clinicaldf ) - errors_bed, warnings_bed = self._cross_validate_bed_files_exist(clinicaldf) + missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf) + errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(missing_bed_files) errors = errors_assay + errors_bed warnings = warnings_assay + warnings_bed diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 92c4ec30..fa9d9be0 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -1,3 +1,4 @@ +from collections import Counter import datetime import json from unittest import mock @@ -1061,65 +1062,73 @@ def test__check_int_dead_consistency_inconsistent(inconsistent_df): ) -@pytest.mark.parametrize( - "test_clinical_df,test_ancillary_files,expected_error,expected_warning", - [ - ( - pd.DataFrame( - {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-SAGE-1", "SAGE-1", "SAGE-1"]} +def get_cross_validate_bed_files_test_cases(): + return [ + { + "name": "all_match", + "test_clinical_df": pd.DataFrame( + { + "SEQ_ASSAY_ID": [ + "SAGE-1-1", + "SAGE-SAGE-1", + "SAGE-1", + "SAGE-1", + "SaGe-1", + ] + } ), - [ + "test_ancillary_files": [ [{"name": "SAGE-SAGE-1.bed", "path": ""}], [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "", - "", - ), - ( - pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2"]}), - [ + "expected_missing_files": [], + }, + { + "name": "partial_match", + "test_clinical_df": pd.DataFrame( + {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2", "SaGe-1_1"]} + ), + "test_ancillary_files": [ [{"name": "SAGE-SAGE-1.bed", "path": ""}], [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " - "Please update your file(s) to be consistent.\n" - "Missing BED files: SAGE-1-2.bed\n", - "", - ), - ( - pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}), - [ + "expected_missing_files": ["sage-1-2.bed"], + }, + { + "name": "no_match", + "test_clinical_df": pd.DataFrame( + {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SaGe_1_2"]} + ), + "test_ancillary_files": [ [{"name": "SAGE-SAGE-1.bed", "path": ""}], [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " - "Please update your file(s) to be consistent.\n" - "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n", - "", - ), - ( - pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}), - [ + "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"], + }, + { + "name": "no_bed_files", + "test_clinical_df": pd.DataFrame( + {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SAge-1_2"]} + ), + "test_ancillary_files": [ [{"name": "SAGE-1.txt", "path": ""}], ], - "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " - "Please update your file(s) to be consistent.\n" - "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n", - "", - ), - ], - ids=["all_match", "partial_match", "no_match", "no_bed_files"], + "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"], + }, + ] + +@pytest.mark.parametrize( + "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"] ) def test_that_cross_validate_bed_files_exist_returns_correct_msgs( - clin_class, test_clinical_df, test_ancillary_files, expected_error, expected_warning + clin_class, test_cases ): - clin_class.ancillary_files = test_ancillary_files - errors, warnings = clin_class._cross_validate_bed_files_exist(test_clinical_df) - assert errors == expected_error - assert warnings == expected_warning + clin_class.ancillary_files = test_cases["test_ancillary_files"] + missing_files = clin_class._cross_validate_bed_files_exist(test_cases["test_clinical_df"]) + assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files) def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class): @@ -1138,29 +1147,62 @@ def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class): clin_class._cross_validate_bed_files_exist(test_clinical_df) patch_parse_file_info.assert_called_once_with( nested_list=clin_class.ancillary_files, - search_str="SAGE-SAGE-1.bed", + search_str="sage-sage-1.bed", ignore_case=True, allow_underscore=True, ) +@pytest.mark.parametrize( + "missing_files,expected_error,expected_warning", + [ + ( + [], + "", + "", + ), + ( + ["test1.bed", "test2.bed"], + "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " + "Please update your file(s) to be consistent.\n" + "Missing BED files: test1.bed, test2.bed\n", + "", + ), + ], + ids=["no_missing_files", "missing_files"], +) +def test_that_cross_validate_bed_files_exist_message_returns_correct_msgs( + clin_class, missing_files, expected_error, expected_warning +): + errors, warnings = clin_class._cross_validate_bed_files_exist_message(missing_files) + assert errors == expected_error + assert warnings == expected_warning + + def test_that__cross_validate_calls_expected_methods(clin_class): with mock.patch.object( Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "") ) as patch__cross_validate_assay, mock.patch.object( Clinical, "_cross_validate_bed_files_exist", return_value=("", "") - ) as patch__cross_validate_bed: + ) as patch__cross_validate_bed, mock.patch.object( + Clinical, "_cross_validate_bed_files_exist_message", return_value=("", "") + ) as patch__cross_validate_bed_msg: clin_class._cross_validate(clinicaldf=pd.DataFrame({"something": [1]})) patch__cross_validate_assay.assert_called_once() patch__cross_validate_bed.assert_called_once() + patch__cross_validate_bed_msg.assert_called_once() def test_that__cross_validate_returns_correct_format_for_errors_warnings(clin_class): with mock.patch.object( Clinical, "_cross_validate_assay_info_has_seq", return_value=("test1", "") ) as patch__cross_validate_assay, mock.patch.object( - Clinical, "_cross_validate_bed_files_exist", return_value=("test3\n", "") - ) as patch__cross_validate_bed: + Clinical, "_cross_validate_bed_files_exist", return_value=["something_missing"] + ) as patch__cross_validate_bed, mock.patch.object( + Clinical, + "_cross_validate_bed_files_exist_message", + return_value=("test3\n", ""), + ) as patch__cross_validate_bed_msg: errors, warnings = clin_class._cross_validate( clinicaldf=pd.DataFrame({"something": [1]}) ) From 667d48b4b65727dca4ebb87997c92858c0ca9847 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 15 Nov 2023 17:34:27 -0800 Subject: [PATCH 2/4] lint --- tests/test_clinical.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_clinical.py b/tests/test_clinical.py index fa9d9be0..1c09c106 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -1120,6 +1120,7 @@ def get_cross_validate_bed_files_test_cases(): }, ] + @pytest.mark.parametrize( "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"] ) @@ -1127,7 +1128,9 @@ def test_that_cross_validate_bed_files_exist_returns_correct_msgs( clin_class, test_cases ): clin_class.ancillary_files = test_cases["test_ancillary_files"] - missing_files = clin_class._cross_validate_bed_files_exist(test_cases["test_clinical_df"]) + missing_files = clin_class._cross_validate_bed_files_exist( + test_cases["test_clinical_df"] + ) assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files) From f48037ed3b372a8e286cabe19f9712f6b7ed4bc6 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:44:02 -0800 Subject: [PATCH 3/4] lint clinical --- genie_registry/clinical.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index b8fba704..d649bee7 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -1016,12 +1016,14 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> list: """Check that a bed file exist per SEQ_ASSAY_ID value in clinical file""" missing_files = [] exception_params = {"ignore_case": True, "allow_underscore": True} - + # standardize and get unique seq assay ids before searching bed files - seq_assay_ids = set([ - validate.standardize_string_for_validation(sq_id, **exception_params) - for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique() - ]) + seq_assay_ids = set( + [ + validate.standardize_string_for_validation(sq_id, **exception_params) + for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique() + ] + ) for seq_assay_id in seq_assay_ids: bed_files = validate.parse_file_info_in_nested_list( @@ -1033,7 +1035,7 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> list: missing_files.append(f"{seq_assay_id}.bed") return missing_files - def _cross_validate_bed_files_exist_message(self, missing_bed_files : list) -> tuple: + def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple: """Gets the warning/error messages given the missing bed files list Args: @@ -1103,7 +1105,9 @@ def _cross_validate(self, clinicaldf) -> tuple: clinicaldf ) missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf) - errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(missing_bed_files) + errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message( + missing_bed_files + ) errors = errors_assay + errors_bed warnings = warnings_assay + warnings_bed From f9f3e4208574cb381a3161d64722f73825eacfd3 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 15 Nov 2023 21:07:56 -0800 Subject: [PATCH 4/4] update bed file case for msg formatting purposes --- genie_registry/clinical.py | 2 +- tests/test_clinical.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index d649bee7..e20dac75 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -1032,7 +1032,7 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> list: **exception_params, ) if not bed_files["files"]: - missing_files.append(f"{seq_assay_id}.bed") + missing_files.append(f"{seq_assay_id.upper()}.bed") return missing_files def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple: diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 1c09c106..91815c97 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -1094,7 +1094,7 @@ def get_cross_validate_bed_files_test_cases(): [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "expected_missing_files": ["sage-1-2.bed"], + "expected_missing_files": ["SAGE-1-2.bed"], }, { "name": "no_match", @@ -1106,7 +1106,7 @@ def get_cross_validate_bed_files_test_cases(): [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"], + "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"], }, { "name": "no_bed_files", @@ -1116,7 +1116,7 @@ def get_cross_validate_bed_files_test_cases(): "test_ancillary_files": [ [{"name": "SAGE-1.txt", "path": ""}], ], - "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"], + "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"], }, ]