Skip to content

Commit

Permalink
Merge pull request #1047 from VisLab/develop
Browse files Browse the repository at this point in the history
Missing HED column in tsv now a warning
  • Loading branch information
VisLab authored Feb 7, 2025
2 parents 0d0aa61 + 2e6f603 commit 6819287
Show file tree
Hide file tree
Showing 9 changed files with 128 additions and 63 deletions.
11 changes: 6 additions & 5 deletions hed/errors/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ def val_error_sidecar_key_missing(invalid_key, category_keys):
return f"Category key '{invalid_key}' does not exist in column. Valid keys are: {category_keys}"


@hed_error(ValidationErrors.TSV_COLUMN_MISSING, actual_code=ValidationErrors.SIDECAR_KEY_MISSING,
           default_severity=ErrorSeverity.WARNING)
def val_error_tsv_column_missing(invalid_key):
    """Build the warning text for a sidecar reference that has no matching tsv column.

    Parameters:
        invalid_key (str): The column name referenced in the sidecar that is absent from the tabular file.

    Returns:
        str: The message, with the missing column name shown inside curly braces.
    """
    # Interpolate the actual key rather than hard-coding HED: the spreadsheet validator
    # raises this for every missing column reference, so the message must name the real one.
    return f"{{{invalid_key}}} is used as a key in a sidecar but does not appear as a column in the tabular file"


@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID, actual_code=ValidationErrors.DEF_EXPAND_INVALID)
def val_error_bad_def_expand(tag, actual_def, found_def):
return f"A data-recording's Def-expand tag does not match the given definition." \
Expand Down Expand Up @@ -314,11 +320,6 @@ def sidecar_hed_used():
return "'HED' is a reserved name and cannot be used as a sidecar except in expected places."


@hed_error(SidecarErrors.SIDECAR_HED_USED_COLUMN, actual_code=ValidationErrors.SIDECAR_INVALID)
def sidecar_hed_used_column():
return "'HED' is a reserved name and cannot be used as a sidecar column name"


@hed_error(SidecarErrors.SIDECAR_NA_USED, actual_code=ValidationErrors.SIDECAR_INVALID)
def sidecar_na_used(column_name):
    """Build the error text for an 'n/a' category key found in a sidecar column.

    Parameters:
        column_name (str): The name of the column in which the 'n/a' key was found.

    Returns:
        str: The formatted error message.
    """
    message_template = "Invalid category key 'n/a' found in column {}."
    return message_template.format(column_name)
Expand Down
5 changes: 3 additions & 2 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class ValidationErrors:
REQUIRED_TAG_MISSING = 'REQUIRED_TAG_MISSING'
SIDECAR_INVALID = 'SIDECAR_INVALID'
SIDECAR_KEY_MISSING = 'SIDECAR_KEY_MISSING'
HED_COLUMN_MISSING = 'HED_COLUMN_MISSING'
STYLE_WARNING = "STYLE_WARNING"
TAG_EMPTY = 'TAG_EMPTY'
TAG_EXPRESSION_REPEATED = 'TAG_EXPRESSION_REPEATED'
Expand Down Expand Up @@ -96,6 +97,7 @@ class ValidationErrors:
HED_PLACEHOLDER_OUT_OF_CONTEXT = 'HED_PLACEHOLDER_OUT_OF_CONTEXT'
CURLY_BRACE_UNSUPPORTED_HERE = 'CURLY_BRACE_UNSUPPORTED_HERE'
ONSETS_UNORDERED = "ONSETS_UNORDERED"
TSV_COLUMN_MISSING="TSV_COLUMN_MISSING"


class SidecarErrors:
Expand All @@ -105,9 +107,8 @@ class SidecarErrors:
INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns'
INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns'
UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn'
SIDECAR_HED_USED_COLUMN = 'sidecar_hed_used_column'
SIDECAR_HED_USED = 'SIDECAR_HED_USED'
SIDECAR_NA_USED = 'SIDECAR_NA_USED'
SIDECAR_HED_USED = 'sidecar_hed_used'
SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"


Expand Down
1 change: 0 additions & 1 deletion hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
# Maps column number to column_entry. This is what's actually used by most code.
self._final_column_map = {}
self._no_mapping_info = True

self._column_map = {}
self._reverse_column_map = {}
self._warn_on_missing_column = warn_on_missing_column
Expand Down
13 changes: 9 additions & 4 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,20 +180,25 @@ def _handle_curly_braces_refs(df, refs, column_names):
modified_df(pd.DataFrame): The modified dataframe with refs replaced
"""
# Filter out columns and refs that don't exist.
refs = [ref for ref in refs if ref in column_names]
remaining_columns = [column for column in column_names if column not in refs]
refs_new = [ref for ref in refs if ref in column_names]
remaining_columns = [column for column in column_names if column not in refs_new]

new_df = df.copy()
# Replace references in the columns we are saving out.
saved_columns = new_df[refs]
saved_columns = new_df[refs_new]
for column_name in remaining_columns:
for replacing_name in refs:
for replacing_name in refs_new:
# If the data has no n/a values, this version is MUCH faster.
# column_name_brackets = f"{{{replacing_name}}}"
# df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
# in zip(df[column_name], saved_columns[replacing_name]))
new_df[column_name] = pd.Series(replace_ref(x, f"{{{replacing_name}}}", y) for x, y
in zip(new_df[column_name], saved_columns[replacing_name]))
# Handle the special case of {HED} when the tsv file has no {HED} column
if 'HED' in refs and 'HED' not in column_names:
for column_name in remaining_columns:
new_df[column_name] =\
pd.Series(replace_ref(x, "{HED}", "n/a") for x in new_df[column_name])
new_df = new_df[remaining_columns]

return new_df
Expand Down
95 changes: 48 additions & 47 deletions hed/models/spreadsheet_input.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
""" A spreadsheet of HED tags. """
from hed.models.column_mapper import ColumnMapper
from hed.models.base_input import BaseInput


class SpreadsheetInput(BaseInput):
""" A spreadsheet of HED tags. """

def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
has_column_names=True, column_prefix_dictionary=None,
name=None):
"""Constructor for the SpreadsheetInput class.
Parameters:
file (str or file like): An xlsx/tsv file to open or a File object.
file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv. data.
worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
If ints then column numbers with [1] indicating only the second column has tags.
has_column_names (bool): True if file has column names. Validation will skip over the first row.
first line of the file if the spreadsheet as column names.
column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
values are HED tag prefixes to prepend to the tags in that column before processing.
Notes:
- If file is a string, file_type is derived from file and this parameter is ignored.
- column_prefix_dictionary may be deprecated/renamed. These are no longer prefixes,
but rather converted to value columns.
e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
{"key": "Description/#", 1: "Label/#"}
It will be a validation issue if column 1 is called "key" in the above example.
This means it no longer accepts anything but the value portion only in the columns.
:raises HedFileError:
- The file is blank.
- An invalid dataframe was passed with size 0.
- An invalid extension was provided.
- A duplicate or empty column name appears.
- Cannot open the indicated file.
- The specified worksheet name does not exist.
"""

new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary,
warn_on_missing_column=False)

super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name)
""" A spreadsheet of HED tags. """
from hed.models.column_mapper import ColumnMapper
from hed.models.base_input import BaseInput


class SpreadsheetInput(BaseInput):
    """ A spreadsheet of HED tags. """

    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
                 has_column_names=True, column_prefix_dictionary=None,
                 name=None):
        """Constructor for the SpreadsheetInput class.

        Parameters:
            file (str or file like): An xlsx/tsv file to open or a File object.
            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv data.
            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
            tag_columns (list): A list of ints or strs identifying the columns that contain the HED tags.
                If ints, they are column numbers, with [1] indicating that only the second column has tags.
            has_column_names (bool): True if the file has column names.
                Validation will skip over the first line of the file if the spreadsheet has column names.
            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
                values that are HED tag prefixes to prepend to the tags in that column before processing.
            name (str or None): Passed through unchanged to the BaseInput constructor.

        Notes:
            - If file is a string, file_type is derived from file and this parameter is ignored.
            - column_prefix_dictionary may be deprecated/renamed. These are no longer prefixes,
              but rather converted to value columns.
              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
              {"key": "Description/#", 1: "Label/#"}
              It will be a validation issue if column 1 is called "key" in the above example.
              This means it no longer accepts anything but the value portion only in the columns.

        :raises HedFileError:
            - The file is blank.
            - An invalid dataframe was passed with size 0.
            - An invalid extension was provided.
            - A duplicate or empty column name appears.
            - Cannot open the indicated file.
            - The specified worksheet name does not exist.
        """
        # Keep the caller's tag column selection on the instance before the base class loads the file.
        self.tag_columns = tag_columns

        # Spreadsheets have no sidecar, so the mapper is built from the tag columns alone
        # and is told not to warn about missing columns.
        mapper = ColumnMapper(
            tag_columns=tag_columns,
            column_prefix_dictionary=column_prefix_dictionary,
            warn_on_missing_column=False,
        )

        super().__init__(file, file_type, worksheet_name, has_column_names, mapper, name=name)
2 changes: 1 addition & 1 deletion hed/validator/sidecar_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler)
"""
val_issues = []
if column_name in self.reserved_column_names:
val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN)
val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED)
return val_issues

column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry, basic_validation=False)
Expand Down
3 changes: 1 addition & 2 deletions hed/validator/spreadsheet_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,6 @@ def _validate_column_structure(self, base_input, error_handler, row_adj):
columns = base_input.columns
for ref in column_refs:
if ref not in columns:
issues += error_handler.format_error_with_context(ColumnErrors.INVALID_COLUMN_REF,
bad_ref=ref)
issues += error_handler.format_error_with_context(ValidationErrors.TSV_COLUMN_MISSING, invalid_key=ref)

return issues
1 change: 0 additions & 1 deletion spec_tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
runAll = True
runOnly = {}


class MyTestCase(unittest.TestCase):
@classmethod
def setUpClass(cls):
Expand Down
60 changes: 60 additions & 0 deletions tests/validator/test_spreadsheet_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from hed.validator import SpreadsheetValidator
from hed import TabularInput, SpreadsheetInput, Sidecar
from hed.errors.error_types import ValidationErrors
from hed.errors.error_reporter import ErrorHandler


class TestSpreadsheetValidation(unittest.TestCase):
Expand Down Expand Up @@ -96,6 +97,65 @@ def test_invalid_onset_invalid_column(self):
self.assertEqual(len(issues), 1)
self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)

def test_empty(self):
    """A tsv whose only content is the text "BadFile" should validate with no issues."""
    handler = ErrorHandler(check_for_warnings=True)
    # worksheet_name, has_column_names and column_prefix_dictionary are left at their defaults.
    sheet = SpreadsheetInput(file=io.StringIO("BadFile"), file_type=".tsv",
                             tag_columns=[3], name='spreadsheets.tsv')
    found = self.validator.validate(sheet, error_handler=handler)
    self.assertEqual(len(found), 0)

def test_tabular_with_hed(self):
    """A sidecar {HED} reference validates without issues when the tsv has a HED column."""
    sidecar_hed_json = '''
    {
        "event_code": {
            "HED": {
                "face": "{HED}",
                "ball": "Red"
            }
        }
    }
    '''
    sidecar = Sidecar(io.StringIO(sidecar_hed_json))
    issues = sidecar.validate(self.hed_schema)
    self.assertEqual(len(issues), 0)
    data = [
        ["onset", "duration", "event_code", "HED"],
        [4.5, 0, "face", "Black"],
        [5.0, 0, "n/a", ""]
    ]
    df = pd.DataFrame(data[1:], columns=data[0])
    # Label the input after this test (it was copy-pasted as 'test_no_hed'), so that any
    # reported issue points at the right fixture.
    my_tab = TabularInput(df, sidecar=sidecar, name='test_with_hed')
    error_handler = ErrorHandler(check_for_warnings=False)
    issues = self.validator.validate(my_tab, error_handler=error_handler)
    self.assertEqual(len(issues), 0)

def test_tabular_no_hed(self):
    """A sidecar {HED} reference is not an error when the tsv lacks a HED column.

    The handler is created with check_for_warnings=False, so only errors would be
    reported here and none are expected.
    """
    sidecar_hed_json = '''
    {
        "event_code": {
            "HED": {
                "face": "{HED}",
                "ball": "Red"
            }
        }
    }
    '''
    sidecar = Sidecar(io.StringIO(sidecar_hed_json))
    issues = sidecar.validate(self.hed_schema)
    # The sidecar itself must be valid; previously this result was computed and discarded.
    self.assertEqual(len(issues), 0)
    data = [
        ["onset", "duration", "event_code"],
        [4.5, 0, "face"],
        [5.0, 0, "ball"]
    ]
    df = pd.DataFrame(data[1:], columns=data[0])
    my_tab = TabularInput(df, sidecar=sidecar, name='test_no_hed')
    error_handler = ErrorHandler(check_for_warnings=False)
    issues = self.validator.validate(my_tab, error_handler=error_handler)
    self.assertEqual(len(issues), 0)

def test_onset_na(self):
# Test with no sidecar
def_dict = "(Definition/Def1, (Event))"
Expand Down

0 comments on commit 6819287

Please sign in to comment.