Sage-Bionetworks · rxu17 · Nov 16, 2023 · Nov 16, 2023 · Nov 16, 2023 · Nov 16, 2023
@@ -1012,28 +1012,45 @@ def _get_dataframe(self, filePathList):
 
         return clinicaldf
 
-    def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple:
+    def _cross_validate_bed_files_exist(self, clinicaldf) -> list:
         """Check that a bed file exist per SEQ_ASSAY_ID value in clinical file"""
-        errors = ""
-        warnings = ""
         missing_files = []
-        seq_assay_ids = clinicaldf["SEQ_ASSAY_ID"].unique().tolist()
+        exception_params = {"ignore_case": True, "allow_underscore": True}
+
+        # standardize and get unique seq assay ids before searching bed files
+        seq_assay_ids = set(
+            [
+                validate.standardize_string_for_validation(sq_id, **exception_params)
+                for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique()
+            ]
+        )
 
         for seq_assay_id in seq_assay_ids:
             bed_files = validate.parse_file_info_in_nested_list(
                 nested_list=self.ancillary_files,
                 search_str=f"{seq_assay_id}.bed",  # type: ignore[arg-type]
-                ignore_case=True,
-                allow_underscore=True,
+                **exception_params,
             )
             if not bed_files["files"]:
-                missing_files.append(f"{seq_assay_id}.bed")
+                missing_files.append(f"{seq_assay_id.upper()}.bed")
+        return missing_files
+
+    def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple:
+        """Build the error and warning messages for the given list of missing bed files
 
-        if missing_files:
+        Args:
+            missing_bed_files (list): list of missing bed files
+
+        Returns:
+            tuple: (errors, warnings) message strings; each is "" when there is nothing to report
+        """
+        errors = ""
+        warnings = ""
+        if missing_bed_files:
             errors = (
                 "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
                 "Please update your file(s) to be consistent.\n"
-                f"Missing BED files: {', '.join(missing_files)}\n"
+                f"Missing BED files: {', '.join(missing_bed_files)}\n"
             )
         return errors, warnings
 
@@ -1087,7 +1104,10 @@ def _cross_validate(self, clinicaldf) -> tuple:
         errors_assay, warnings_assay = self._cross_validate_assay_info_has_seq(
             clinicaldf
         )
-        errors_bed, warnings_bed = self._cross_validate_bed_files_exist(clinicaldf)
+        missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf)
+        errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(
+            missing_bed_files
+        )
 
         errors = errors_assay + errors_bed
         warnings = warnings_assay + warnings_bed

@@ -1,3 +1,4 @@
+from collections import Counter
 import datetime
 import json
 from unittest import mock
@@ -1061,65 +1062,76 @@ def test__check_int_dead_consistency_inconsistent(inconsistent_df):
     )
 
 
-@pytest.mark.parametrize(
-    "test_clinical_df,test_ancillary_files,expected_error,expected_warning",
-    [
-        (
-            pd.DataFrame(
-                {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-SAGE-1", "SAGE-1", "SAGE-1"]}
+def get_cross_validate_bed_files_test_cases():
+    return [
+        {
+            "name": "all_match",
+            "test_clinical_df": pd.DataFrame(
+                {
+                    "SEQ_ASSAY_ID": [
+                        "SAGE-1-1",
+                        "SAGE-SAGE-1",
+                        "SAGE-1",
+                        "SAGE-1",
+                        "SaGe-1",
+                    ]
+                }
             ),
-            [
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2"]}),
-            [
+            "expected_missing_files": [],
+        },
+        {
+            "name": "partial_match",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2", "SaGe-1_1"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed\n",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}),
-            [
+            "expected_missing_files": ["SAGE-1-2.bed"],
+        },
+        {
+            "name": "no_match",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SaGe_1_2"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}),
-            [
+            "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"],
+        },
+        {
+            "name": "no_bed_files",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SAge-1_2"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-1.txt", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n",
-            "",
-        ),
-    ],
-    ids=["all_match", "partial_match", "no_match", "no_bed_files"],
+            "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"],
+        },
+    ]
+
+
+@pytest.mark.parametrize(
+    "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"]
 )
 def test_that_cross_validate_bed_files_exist_returns_correct_msgs(
-    clin_class, test_clinical_df, test_ancillary_files, expected_error, expected_warning
+    clin_class, test_cases
 ):
-    clin_class.ancillary_files = test_ancillary_files
-    errors, warnings = clin_class._cross_validate_bed_files_exist(test_clinical_df)
-    assert errors == expected_error
-    assert warnings == expected_warning
+    clin_class.ancillary_files = test_cases["test_ancillary_files"]
+    missing_files = clin_class._cross_validate_bed_files_exist(
+        test_cases["test_clinical_df"]
+    )
+    assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files)
 
 
 def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
@@ -1138,29 +1150,62 @@ def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
         clin_class._cross_validate_bed_files_exist(test_clinical_df)
         patch_parse_file_info.assert_called_once_with(
             nested_list=clin_class.ancillary_files,
-            search_str="SAGE-SAGE-1.bed",
+            search_str="sage-sage-1.bed",
             ignore_case=True,
             allow_underscore=True,
         )
 
 
+@pytest.mark.parametrize(
+    "missing_files,expected_error,expected_warning",
+    [
+        (
+            [],
+            "",
+            "",
+        ),
+        (
+            ["test1.bed", "test2.bed"],
+            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
+            "Please update your file(s) to be consistent.\n"
+            "Missing BED files: test1.bed, test2.bed\n",
+            "",
+        ),
+    ],
+    ids=["no_missing_files", "missing_files"],
+)
+def test_that_cross_validate_bed_files_exist_message_returns_correct_msgs(
+    clin_class, missing_files, expected_error, expected_warning
+):
+    errors, warnings = clin_class._cross_validate_bed_files_exist_message(missing_files)
+    assert errors == expected_error
+    assert warnings == expected_warning
+
+
 def test_that__cross_validate_calls_expected_methods(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "")
     ) as patch__cross_validate_assay, mock.patch.object(
         Clinical, "_cross_validate_bed_files_exist", return_value=("", "")
-    ) as patch__cross_validate_bed:
+    ) as patch__cross_validate_bed, mock.patch.object(
+        Clinical, "_cross_validate_bed_files_exist_message", return_value=("", "")
+    ) as patch__cross_validate_bed_msg:
         clin_class._cross_validate(clinicaldf=pd.DataFrame({"something": [1]}))
         patch__cross_validate_assay.assert_called_once()
         patch__cross_validate_bed.assert_called_once()
+        patch__cross_validate_bed_msg.assert_called_once()
 
 
 def test_that__cross_validate_returns_correct_format_for_errors_warnings(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("test1", "")
     ) as patch__cross_validate_assay, mock.patch.object(
-        Clinical, "_cross_validate_bed_files_exist", return_value=("test3\n", "")
-    ) as patch__cross_validate_bed:
+        Clinical, "_cross_validate_bed_files_exist", return_value=["something_missing"]
+    ) as patch__cross_validate_bed, mock.patch.object(
+        Clinical,
+        "_cross_validate_bed_files_exist_message",
+        return_value=("test3\n", ""),
+    ) as patch__cross_validate_bed_msg:
         errors, warnings = clin_class._cross_validate(
             clinicaldf=pd.DataFrame({"something": [1]})
         )