Merge pull request #1047 from VisLab/develop

Missing HED column in tsv now a warning
hed-standard · Feb 7, 2025 · 6819287 · 6819287
2 parents 0d0aa61 + 2e6f603
commit 6819287
Show file tree

Hide file tree

Showing 9 changed files with 128 additions and 63 deletions.
diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py
@@ -215,6 +215,12 @@ def val_error_sidecar_key_missing(invalid_key, category_keys):
     return f"Category key '{invalid_key}' does not exist in column.  Valid keys are: {category_keys}"
 
 
+@hed_error(ValidationErrors.TSV_COLUMN_MISSING, actual_code=ValidationErrors.SIDECAR_KEY_MISSING,
+           default_severity=ErrorSeverity.WARNING)
+def val_error_tsv_column_missing(invalid_key):
+    """ Report a sidecar reference (e.g. {HED}) that has no matching column in the tabular file. """
+    # Triple braces: the doubled pairs render a literal '{' and '}', the inner pair interpolates the name.
+    return f"{{{invalid_key}}} is used as a key in a sidecar but does not appear as a column in the tabular file"
+
+
 @hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID, actual_code=ValidationErrors.DEF_EXPAND_INVALID)
 def val_error_bad_def_expand(tag, actual_def, found_def):
     return f"A data-recording's Def-expand tag does not match the given definition." \
@@ -314,11 +320,6 @@ def sidecar_hed_used():
     return "'HED' is a reserved name and cannot be used as a sidecar except in expected places."
 
 
-@hed_error(SidecarErrors.SIDECAR_HED_USED_COLUMN, actual_code=ValidationErrors.SIDECAR_INVALID)
-def sidecar_hed_used_column():
-    return "'HED' is a reserved name and cannot be used as a sidecar column name"
-
-
 @hed_error(SidecarErrors.SIDECAR_NA_USED, actual_code=ValidationErrors.SIDECAR_INVALID)
 def sidecar_na_used(column_name):
     return f"Invalid category key 'n/a' found in column {column_name}."

diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py
@@ -37,6 +37,7 @@ class ValidationErrors:
     REQUIRED_TAG_MISSING = 'REQUIRED_TAG_MISSING'
     SIDECAR_INVALID = 'SIDECAR_INVALID'
     SIDECAR_KEY_MISSING = 'SIDECAR_KEY_MISSING'
+    HED_COLUMN_MISSING = 'HED_COLUMN_MISSING'
     STYLE_WARNING = "STYLE_WARNING"
     TAG_EMPTY = 'TAG_EMPTY'
     TAG_EXPRESSION_REPEATED = 'TAG_EXPRESSION_REPEATED'
@@ -96,6 +97,7 @@ class ValidationErrors:
     HED_PLACEHOLDER_OUT_OF_CONTEXT = 'HED_PLACEHOLDER_OUT_OF_CONTEXT'
     CURLY_BRACE_UNSUPPORTED_HERE = 'CURLY_BRACE_UNSUPPORTED_HERE'
     ONSETS_UNORDERED = "ONSETS_UNORDERED"
+    TSV_COLUMN_MISSING="TSV_COLUMN_MISSING"
 
 
 class SidecarErrors:
@@ -105,9 +107,8 @@ class SidecarErrors:
     INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns'
     INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns'
     UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn'
-    SIDECAR_HED_USED_COLUMN = 'sidecar_hed_used_column'
+    SIDECAR_HED_USED = 'SIDECAR_HED_USED'
     SIDECAR_NA_USED = 'SIDECAR_NA_USED'
-    SIDECAR_HED_USED = 'sidecar_hed_used'
     SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"
 
 

diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py
@@ -49,7 +49,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
         # Maps column number to column_entry.  This is what's actually used by most code.
         self._final_column_map = {}
         self._no_mapping_info = True
-
         self._column_map = {}
         self._reverse_column_map = {}
         self._warn_on_missing_column = warn_on_missing_column

diff --git a/hed/models/df_util.py b/hed/models/df_util.py
@@ -180,20 +180,25 @@ def _handle_curly_braces_refs(df, refs, column_names):
         modified_df(pd.DataFrame): The modified dataframe with refs replaced
     """
     # Filter out columns and refs that don't exist.
-    refs = [ref for ref in refs if ref in column_names]
-    remaining_columns = [column for column in column_names if column not in refs]
+    refs_new = [ref for ref in refs if ref in column_names]
+    remaining_columns = [column for column in column_names if column not in refs_new]
 
     new_df = df.copy()
     # Replace references in the columns we are saving out.
-    saved_columns = new_df[refs]
+    saved_columns = new_df[refs_new]
     for column_name in remaining_columns:
-        for replacing_name in refs:
+        for replacing_name in refs_new:
             # If the data has no n/a values, this version is MUCH faster.
             # column_name_brackets = f"{{{replacing_name}}}"
             # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
             #                             in zip(df[column_name], saved_columns[replacing_name]))
             new_df[column_name] = pd.Series(replace_ref(x, f"{{{replacing_name}}}", y) for x, y
                                             in zip(new_df[column_name], saved_columns[replacing_name]))
+    # Handle the special case of {HED} when the tsv file has no {HED} column
+    if 'HED' in refs and 'HED' not in column_names:
+        for column_name in remaining_columns:
+            new_df[column_name] =\
+                pd.Series(replace_ref(x, "{HED}", "n/a") for x in new_df[column_name])
     new_df = new_df[remaining_columns]
 
     return new_df

diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py
@@ -1,47 +1,48 @@
-""" A spreadsheet of HED tags. """
-from hed.models.column_mapper import ColumnMapper
-from hed.models.base_input import BaseInput
-
-
-class SpreadsheetInput(BaseInput):
-    """ A spreadsheet of HED tags. """
-
-    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
-                 has_column_names=True, column_prefix_dictionary=None,
-                 name=None):
-        """Constructor for the SpreadsheetInput class.
-
-        Parameters:
-            file (str or file like): An xlsx/tsv file to open or a File object.
-            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv. data.
-            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
-                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
-            tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
-                If ints then column numbers with [1] indicating only the second column has tags.
-            has_column_names (bool): True if file has column names. Validation will skip over the first row.
-                first line of the file if the spreadsheet as column names.
-            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
-                values are HED tag prefixes to prepend to the tags in that column before processing.
-
-        Notes:
-            - If file is a string, file_type is derived from file and this parameter is ignored.
-            - column_prefix_dictionary may be deprecated/renamed.  These are no longer prefixes,
-              but rather converted to value columns.
-              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
-              {"key": "Description/#", 1: "Label/#"}
-              It will be a validation issue if column 1 is called "key" in the above example.
-              This means it no longer accepts anything but the value portion only in the columns.
-
-        :raises HedFileError:
-            - The file is blank.
-            - An invalid dataframe was passed with size 0.
-            - An invalid extension was provided.
-            - A duplicate or empty column name appears.
-            - Cannot open the indicated file.
-            - The specified worksheet name does not exist.
-        """
-
-        new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary,
-                                  warn_on_missing_column=False)
-
-        super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name)
+""" A spreadsheet of HED tags. """
+from hed.models.column_mapper import ColumnMapper
+from hed.models.base_input import BaseInput
+
+
+class SpreadsheetInput(BaseInput):
+    """ A spreadsheet of HED tags. """
+
+    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
+                 has_column_names=True, column_prefix_dictionary=None,
+                 name=None):
+        """Constructor for the SpreadsheetInput class.
+
+        Parameters:
+            file (str or file like): An xlsx/tsv file to open or a File object.
+            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv. data.
+            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
+                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
+            tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
+                If ints then column numbers with [1] indicating only the second column has tags.
+            has_column_names (bool): True if file has column names.
+                Validation will skip over the first line of the file if the spreadsheet has column names.
+            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
+                values are HED tag prefixes to prepend to the tags in that column before processing.
+
+        Notes:
+            - If file is a string, file_type is derived from file and this parameter is ignored.
+            - column_prefix_dictionary may be deprecated/renamed.  These are no longer prefixes,
+              but rather converted to value columns.
+              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
+              {"key": "Description/#", 1: "Label/#"}
+              It will be a validation issue if column 1 is called "key" in the above example.
+              This means it no longer accepts anything but the value portion only in the columns.
+
+        :raises HedFileError:
+            - The file is blank.
+            - An invalid dataframe was passed with size 0.
+            - An invalid extension was provided.
+            - A duplicate or empty column name appears.
+            - Cannot open the indicated file.
+            - The specified worksheet name does not exist.
+        """
+
+        self.tag_columns = tag_columns
+        new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary,
+                                  warn_on_missing_column=False)
+
+        super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name)
diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py
@@ -244,7 +244,7 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler)
         """
         val_issues = []
         if column_name in self.reserved_column_names:
-            val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN)
+            val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED)
             return val_issues
 
         column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry, basic_validation=False)

diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py
@@ -236,7 +236,6 @@ def _validate_column_structure(self, base_input, error_handler, row_adj):
         columns = base_input.columns
         for ref in column_refs:
             if ref not in columns:
-                issues += error_handler.format_error_with_context(ColumnErrors.INVALID_COLUMN_REF,
-                                                                  bad_ref=ref)
+                issues += error_handler.format_error_with_context(ValidationErrors.TSV_COLUMN_MISSING, invalid_key=ref)
 
         return issues
diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py
@@ -24,7 +24,6 @@
 runAll = True
 runOnly = {}
 
-
 class MyTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):

diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py
@@ -9,6 +9,7 @@
 from hed.validator import SpreadsheetValidator
 from hed import TabularInput, SpreadsheetInput, Sidecar
 from hed.errors.error_types import ValidationErrors
+from hed.errors.error_reporter import ErrorHandler
 
 
 class TestSpreadsheetValidation(unittest.TestCase):
@@ -96,6 +97,65 @@ def test_invalid_onset_invalid_column(self):
         self.assertEqual(len(issues), 1)
         self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)
 
+    def test_empty(self):
+        spreadsheet = SpreadsheetInput(file=io.StringIO("BadFile"), worksheet_name=None,
+                                       file_type=".tsv", tag_columns=[3],
+                                       has_column_names=True, column_prefix_dictionary=None,
+                                       name='spreadsheets.tsv')
+        error_handler = ErrorHandler(check_for_warnings=True)
+        issues = self.validator.validate(spreadsheet, error_handler=error_handler)
+        self.assertEqual(len(issues), 0)
+
+    def test_tabular_with_hed(self):
+        # The sidecar references {HED} and the tabular data supplies a HED column, so no issues are expected.
+        sidecar_hed_json = '''
+           {
+               "event_code": {
+                   "HED": {
+                        "face": "{HED}",
+                        "ball": "Red"
+                   }
+               }
+           }
+           '''
+        sidecar = Sidecar(io.StringIO(sidecar_hed_json))
+        issues = sidecar.validate(self.hed_schema)
+        self.assertEqual(len(issues), 0)
+        data = [
+            ["onset", "duration", "event_code", "HED"],
+            [4.5, 0, "face", "Black"],
+            [5.0, 0, "n/a", ""]
+        ]
+        df = pd.DataFrame(data[1:], columns=data[0])
+        my_tab = TabularInput(df, sidecar=sidecar, name='test_with_hed')
+        error_handler = ErrorHandler(check_for_warnings=False)
+        issues = self.validator.validate(my_tab, error_handler=error_handler)
+        self.assertEqual(len(issues), 0)
+
+    def test_tabular_no_hed(self):
+        # The sidecar references {HED} but the tabular data has no HED column.  The missing column is
+        # reported as a warning (TSV_COLUMN_MISSING); this test runs with check_for_warnings=False
+        # and expects no issues.
+        sidecar_hed_json = '''
+        {
+            "event_code": {
+                "HED": {
+                     "face": "{HED}",
+                     "ball": "Red"
+                }
+            }
+        }
+        '''
+        sidecar = Sidecar(io.StringIO(sidecar_hed_json))
+        issues = sidecar.validate(self.hed_schema)
+        self.assertEqual(len(issues), 0)
+        data = [
+            ["onset", "duration", "event_code"],
+            [4.5, 0, "face"],
+            [5.0, 0, "ball"]
+        ]
+        df = pd.DataFrame(data[1:], columns=data[0])
+        my_tab = TabularInput(df, sidecar=sidecar, name='test_no_hed')
+        error_handler = ErrorHandler(check_for_warnings=False)
+        issues = self.validator.validate(my_tab, error_handler=error_handler)
+        self.assertEqual(len(issues), 0)
+
     def test_onset_na(self):
         # Test with no sidecar
         def_dict = "(Definition/Def1, (Event))"