From ab7e882a00e9c44cf2f5c51c6e7dd09e50afec22 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 15 Nov 2023 17:24:04 -0800
Subject: [PATCH 1/4] fix bed/clinical file validation

---
 genie_registry/clinical.py |  34 +++++++---
 tests/test_clinical.py     | 132 ++++++++++++++++++++++++-------------
 2 files changed, 112 insertions(+), 54 deletions(-)

diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
index 014b0b55..b8fba704 100644
--- a/genie_registry/clinical.py
+++ b/genie_registry/clinical.py
@@ -1012,28 +1012,43 @@ def _get_dataframe(self, filePathList):
 
         return clinicaldf
 
-    def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple:
+    def _cross_validate_bed_files_exist(self, clinicaldf) -> list:
         """Check that a bed file exist per SEQ_ASSAY_ID value in clinical file"""
-        errors = ""
-        warnings = ""
         missing_files = []
-        seq_assay_ids = clinicaldf["SEQ_ASSAY_ID"].unique().tolist()
+        exception_params = {"ignore_case": True, "allow_underscore": True}
+        
+        # standardize and get unique seq assay ids before searching bed files
+        seq_assay_ids = set([
+            validate.standardize_string_for_validation(sq_id, **exception_params)
+            for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique()
+        ])
 
         for seq_assay_id in seq_assay_ids:
             bed_files = validate.parse_file_info_in_nested_list(
                 nested_list=self.ancillary_files,
                 search_str=f"{seq_assay_id}.bed",  # type: ignore[arg-type]
-                ignore_case=True,
-                allow_underscore=True,
+                **exception_params,
             )
             if not bed_files["files"]:
                 missing_files.append(f"{seq_assay_id}.bed")
+        return missing_files
+
+    def _cross_validate_bed_files_exist_message(self, missing_bed_files : list) -> tuple:
+        """Gets the warning/error messages given the missing bed files list
+
+        Args:
+            missing_bed_files (list): list of missing bed files
 
-        if missing_files:
+        Returns:
+            tuple: error + warning
+        """
+        errors = ""
+        warnings = ""
+        if missing_bed_files:
             errors = (
                 "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
                 "Please update your file(s) to be consistent.\n"
-                f"Missing BED files: {', '.join(missing_files)}\n"
+                f"Missing BED files: {', '.join(missing_bed_files)}\n"
             )
         return errors, warnings
 
@@ -1087,7 +1102,8 @@ def _cross_validate(self, clinicaldf) -> tuple:
         errors_assay, warnings_assay = self._cross_validate_assay_info_has_seq(
             clinicaldf
         )
-        errors_bed, warnings_bed = self._cross_validate_bed_files_exist(clinicaldf)
+        missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf)
+        errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(missing_bed_files)
 
         errors = errors_assay + errors_bed
         warnings = warnings_assay + warnings_bed
diff --git a/tests/test_clinical.py b/tests/test_clinical.py
index 92c4ec30..fa9d9be0 100644
--- a/tests/test_clinical.py
+++ b/tests/test_clinical.py
@@ -1,3 +1,4 @@
+from collections import Counter
 import datetime
 import json
 from unittest import mock
@@ -1061,65 +1062,73 @@ def test__check_int_dead_consistency_inconsistent(inconsistent_df):
     )
 
 
-@pytest.mark.parametrize(
-    "test_clinical_df,test_ancillary_files,expected_error,expected_warning",
-    [
-        (
-            pd.DataFrame(
-                {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-SAGE-1", "SAGE-1", "SAGE-1"]}
+def get_cross_validate_bed_files_test_cases():
+    return [
+        {
+            "name": "all_match",
+            "test_clinical_df": pd.DataFrame(
+                {
+                    "SEQ_ASSAY_ID": [
+                        "SAGE-1-1",
+                        "SAGE-SAGE-1",
+                        "SAGE-1",
+                        "SAGE-1",
+                        "SaGe-1",
+                    ]
+                }
             ),
-            [
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2"]}),
-            [
+            "expected_missing_files": [],
+        },
+        {
+            "name": "partial_match",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2", "SaGe-1_1"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed\n",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}),
-            [
+            "expected_missing_files": ["sage-1-2.bed"],
+        },
+        {
+            "name": "no_match",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SaGe_1_2"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}),
-            [
+            "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"],
+        },
+        {
+            "name": "no_bed_files",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SAge-1_2"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-1.txt", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n",
-            "",
-        ),
-    ],
-    ids=["all_match", "partial_match", "no_match", "no_bed_files"],
+            "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"],
+        },
+    ]
+
+@pytest.mark.parametrize(
+    "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"]
 )
 def test_that_cross_validate_bed_files_exist_returns_correct_msgs(
-    clin_class, test_clinical_df, test_ancillary_files, expected_error, expected_warning
+    clin_class, test_cases
 ):
-    clin_class.ancillary_files = test_ancillary_files
-    errors, warnings = clin_class._cross_validate_bed_files_exist(test_clinical_df)
-    assert errors == expected_error
-    assert warnings == expected_warning
+    clin_class.ancillary_files = test_cases["test_ancillary_files"]
+    missing_files = clin_class._cross_validate_bed_files_exist(test_cases["test_clinical_df"])
+    assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files)
 
 
 def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
@@ -1138,29 +1147,62 @@ def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
         clin_class._cross_validate_bed_files_exist(test_clinical_df)
         patch_parse_file_info.assert_called_once_with(
             nested_list=clin_class.ancillary_files,
-            search_str="SAGE-SAGE-1.bed",
+            search_str="sage-sage-1.bed",
             ignore_case=True,
             allow_underscore=True,
         )
 
 
+@pytest.mark.parametrize(
+    "missing_files,expected_error,expected_warning",
+    [
+        (
+            [],
+            "",
+            "",
+        ),
+        (
+            ["test1.bed", "test2.bed"],
+            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
+            "Please update your file(s) to be consistent.\n"
+            "Missing BED files: test1.bed, test2.bed\n",
+            "",
+        ),
+    ],
+    ids=["no_missing_files", "missing_files"],
+)
+def test_that_cross_validate_bed_files_exist_message_returns_correct_msgs(
+    clin_class, missing_files, expected_error, expected_warning
+):
+    errors, warnings = clin_class._cross_validate_bed_files_exist_message(missing_files)
+    assert errors == expected_error
+    assert warnings == expected_warning
+
+
 def test_that__cross_validate_calls_expected_methods(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "")
     ) as patch__cross_validate_assay, mock.patch.object(
         Clinical, "_cross_validate_bed_files_exist", return_value=("", "")
-    ) as patch__cross_validate_bed:
+    ) as patch__cross_validate_bed, mock.patch.object(
+        Clinical, "_cross_validate_bed_files_exist_message", return_value=("", "")
+    ) as patch__cross_validate_bed_msg:
         clin_class._cross_validate(clinicaldf=pd.DataFrame({"something": [1]}))
         patch__cross_validate_assay.assert_called_once()
         patch__cross_validate_bed.assert_called_once()
+        patch__cross_validate_bed_msg.assert_called_once()
 
 
 def test_that__cross_validate_returns_correct_format_for_errors_warnings(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("test1", "")
     ) as patch__cross_validate_assay, mock.patch.object(
-        Clinical, "_cross_validate_bed_files_exist", return_value=("test3\n", "")
-    ) as patch__cross_validate_bed:
+        Clinical, "_cross_validate_bed_files_exist", return_value=["something_missing"]
+    ) as patch__cross_validate_bed, mock.patch.object(
+        Clinical,
+        "_cross_validate_bed_files_exist_message",
+        return_value=("test3\n", ""),
+    ) as patch__cross_validate_bed_msg:
         errors, warnings = clin_class._cross_validate(
             clinicaldf=pd.DataFrame({"something": [1]})
         )

From 667d48b4b65727dca4ebb87997c92858c0ca9847 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 15 Nov 2023 17:34:27 -0800
Subject: [PATCH 2/4] lint

---
 tests/test_clinical.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_clinical.py b/tests/test_clinical.py
index fa9d9be0..1c09c106 100644
--- a/tests/test_clinical.py
+++ b/tests/test_clinical.py
@@ -1120,6 +1120,7 @@ def get_cross_validate_bed_files_test_cases():
         },
     ]
 
+
 @pytest.mark.parametrize(
     "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"]
 )
@@ -1127,7 +1128,9 @@ def test_that_cross_validate_bed_files_exist_returns_correct_msgs(
     clin_class, test_cases
 ):
     clin_class.ancillary_files = test_cases["test_ancillary_files"]
-    missing_files = clin_class._cross_validate_bed_files_exist(test_cases["test_clinical_df"])
+    missing_files = clin_class._cross_validate_bed_files_exist(
+        test_cases["test_clinical_df"]
+    )
     assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files)
 
 

From f48037ed3b372a8e286cabe19f9712f6b7ed4bc6 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 15 Nov 2023 20:44:02 -0800
Subject: [PATCH 3/4] lint clinical

---
 genie_registry/clinical.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
index b8fba704..d649bee7 100644
--- a/genie_registry/clinical.py
+++ b/genie_registry/clinical.py
@@ -1016,12 +1016,14 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> list:
         """Check that a bed file exist per SEQ_ASSAY_ID value in clinical file"""
         missing_files = []
         exception_params = {"ignore_case": True, "allow_underscore": True}
-        
+
         # standardize and get unique seq assay ids before searching bed files
-        seq_assay_ids = set([
-            validate.standardize_string_for_validation(sq_id, **exception_params)
-            for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique()
-        ])
+        seq_assay_ids = set(
+            [
+                validate.standardize_string_for_validation(sq_id, **exception_params)
+                for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique()
+            ]
+        )
 
         for seq_assay_id in seq_assay_ids:
             bed_files = validate.parse_file_info_in_nested_list(
@@ -1033,7 +1035,7 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> list:
                 missing_files.append(f"{seq_assay_id}.bed")
         return missing_files
 
-    def _cross_validate_bed_files_exist_message(self, missing_bed_files : list) -> tuple:
+    def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple:
         """Gets the warning/error messages given the missing bed files list
 
         Args:
@@ -1103,7 +1105,9 @@ def _cross_validate(self, clinicaldf) -> tuple:
             clinicaldf
         )
         missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf)
-        errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(missing_bed_files)
+        errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(
+            missing_bed_files
+        )
 
         errors = errors_assay + errors_bed
         warnings = warnings_assay + warnings_bed

From f9f3e4208574cb381a3161d64722f73825eacfd3 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 15 Nov 2023 21:07:56 -0800
Subject: [PATCH 4/4] update bed file case for msg formatting purposes

---
 genie_registry/clinical.py | 2 +-
 tests/test_clinical.py     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
index d649bee7..e20dac75 100644
--- a/genie_registry/clinical.py
+++ b/genie_registry/clinical.py
@@ -1032,7 +1032,7 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> list:
                 **exception_params,
             )
             if not bed_files["files"]:
-                missing_files.append(f"{seq_assay_id}.bed")
+                missing_files.append(f"{seq_assay_id.upper()}.bed")
         return missing_files
 
     def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple:
diff --git a/tests/test_clinical.py b/tests/test_clinical.py
index 1c09c106..91815c97 100644
--- a/tests/test_clinical.py
+++ b/tests/test_clinical.py
@@ -1094,7 +1094,7 @@ def get_cross_validate_bed_files_test_cases():
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "expected_missing_files": ["sage-1-2.bed"],
+            "expected_missing_files": ["SAGE-1-2.bed"],
         },
         {
             "name": "no_match",
@@ -1106,7 +1106,7 @@ def get_cross_validate_bed_files_test_cases():
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"],
+            "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"],
         },
         {
             "name": "no_bed_files",
@@ -1116,7 +1116,7 @@ def get_cross_validate_bed_files_test_cases():
             "test_ancillary_files": [
                 [{"name": "SAGE-1.txt", "path": ""}],
             ],
-            "expected_missing_files": ["sage-1-2.bed", "sage-1-3.bed"],
+            "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"],
         },
     ]