From 95cdbe9494e7150ff43f07143efe2cdbd5d2cc47 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 16 Jan 2025 13:44:55 -0500 Subject: [PATCH 01/12] Start adding schema to CuBIDS. --- cubids/config.py | 30 ++++++++++++++++++++++++++++++ cubids/cubids.py | 8 +++++++- cubids/data/schema.json | 1 + 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 cubids/data/schema.json diff --git a/cubids/config.py b/cubids/config.py index a863d4eab..421299581 100644 --- a/cubids/config.py +++ b/cubids/config.py @@ -4,6 +4,7 @@ import importlib.resources import yaml + def load_config(config_file): """Load a YAML file containing a configuration for param groups. @@ -24,3 +25,32 @@ def load_config(config_file): config = yaml.safe_load(f) return config + + +def load_schema(schema_file): + """Load a JSON file containing the BIDS schema. + + Parameters + ---------- + schema_file : str or pathlib.Path, optional + The path to the schema file. If None, the default schema file is used. + + Returns + ------- + dict + The schema loaded from the YAML file. + """ + import json + + if schema_file is None: + schema_file = Path(importlib.resources.files("cubids") / "data/schema.json") + + with schema_file.open() as f: + schema = json.load(f) + + print( + f"Loading BIDS schema version: {schema['schema_version']}. " + f"BIDS version: {schema['bids_version']}" + ) + + return schema diff --git a/cubids/cubids.py b/cubids/cubids.py index 158e97d05..ee135b4a6 100644 --- a/cubids/cubids.py +++ b/cubids/cubids.py @@ -21,7 +21,7 @@ from sklearn.cluster import AgglomerativeClustering from tqdm import tqdm -from cubids.config import load_config +from cubids.config import load_config, load_schema from cubids.constants import ID_VARS, NON_KEY_ENTITIES from cubids.metadata_merge import check_merging_operations, group_by_acquisition_sets @@ -47,6 +47,9 @@ class CuBIDS(object): force_unlock : :obj:`bool`, optional If True, force unlock all files in the BIDS dataset. Default is False. + schema_json : :obj:`str`, optional + Path to a BIDS schema JSON file. + Default is None, in which case the default schema in CuBIDS is used. Attributes ---------- @@ -93,6 +96,7 @@ def __init__( acq_group_level="subject", grouping_config=None, force_unlock=False, + schema_json=None, ): self.path = os.path.abspath(data_root) self._layout = None @@ -110,6 +114,8 @@ def __init__( self.cubids_code_dir = Path(self.path + "/code/CuBIDS").is_dir() self.data_dict = {} # data dictionary for TSV outputs self.use_datalad = use_datalad # True if flag set, False if flag unset + self.schema = load_schema(schema_json) + if self.use_datalad: self.init_datalad() diff --git a/cubids/data/schema.json b/cubids/data/schema.json new file mode 100644 index 000000000..e5dc777ec --- /dev/null +++ b/cubids/data/schema.json @@ -0,0 +1 @@ +{"schema_version": "0.11.3", "bids_version": "1.10.0", "meta": {"associations": {"events": {"selectors": ["task in entities", "extension != '.json'"], "target": {"suffix": "events", "extension": ".tsv"}, "inherit": true}, "aslcontext": {"selectors": ["suffix == 'asl'", "match(extension, '\\.nii(\\.gz)?$')"], "target": {"suffix": "aslcontext", "extension": ".tsv"}, "inherit": true}, "m0scan": {"selectors": ["suffix == 'asl'", "match(extension, '\\.nii(\\.gz)?$')"], "target": {"suffix": "m0scan", "extension": [".nii", ".nii.gz"]}, "inherit": false}, "magnitude": {"selectors": ["suffix == 'fieldmap'", "match(extension, '\\.nii(\\.gz)?$')"], "target": {"suffix": "magnitude", "extension": [".nii", ".nii.gz"]}, "inherit": false}, "magnitude1": {"selectors": ["match(suffix, 'phase(diff|1)$')", "match(extension, '\\.nii(\\.gz)?$')"], "target": {"suffix": "magnitude1", "extension": [".nii", ".nii.gz"]}, "inherit": false}, "bval": {"selectors": ["intersects([suffix], ['dwi', 'epi'])", "match(extension, '\\.nii(\\.gz)?$')"], "target": {"extension": ".bval"}, "inherit": true}, "bvec": {"selectors": ["intersects([suffix], ['dwi', 'epi'])", "match(extension, '\\.nii(\\.gz)?$')"], "target": {"extension": ".bvec"}, "inherit": true}, "channels": {"selectors": ["intersects([suffix], ['eeg', 'ieeg', 'meg', 'nirs', 'motion', 'optodes'])", "extension != '.json'"], "target": {"suffix": "channels", "extension": ".tsv"}, "inherit": true}, "coordsystem": {"selectors": ["intersects([suffix], ['eeg', 'ieeg', 'meg', 'nirs', 'motion', 'electrodes', 'optodes'])", "extension != '.json'"], "target": {"suffix": "coordsystem", "extension": ".json"}, "inherit": true}}, "context": {"type": "object", "required": ["schema", "dataset", "path", "size", "sidecar", "associations"], "additionalProperties": false, "properties": {"schema": {"description": "The BIDS specification schema", "type": "object"}, "dataset": {"description": "Properties and contents of the entire dataset", "type": "object", "required": ["dataset_description", "tree", "ignored", "datatypes", "modalities", "subjects"], "additionalProperties": false, "properties": {"dataset_description": {"description": "Contents of /dataset_description.json", "type": "object"}, "tree": {"description": "Tree view of all files in dataset", "type": "object"}, "ignored": {"description": "Set of ignored files", "type": "array", "items": {"type": "string"}}, "datatypes": {"description": "Data types present in the dataset", "type": "array", "items": {"type": "string"}}, "modalities": {"description": "Modalities present in the dataset", "type": "array", "items": {"type": "string"}}, "subjects": {"description": "Collections of subjects in dataset", "type": "object", "required": ["sub_dirs"], "additionalProperties": false, "properties": {"sub_dirs": {"description": "Subjects as determined by sub-* directories", "type": "array", "items": {"type": "string"}}, "participant_id": {"description": "The participant_id column of participants.tsv", "type": "array", "items": {"type": "string"}}, "phenotype": {"description": "The union of participant_id columns in phenotype files", "type": "array", "items": {"type": "string"}}}}}}, "subject": {"description": "Properties and contents of the current subject", "type": "object", "required": ["sessions"], "additionalProperties": false, "properties": {"sessions": {"description": "Collections of sessions in subject", "type": "object", "required": ["ses_dirs"], "additionalProperties": false, "properties": {"ses_dirs": {"description": "Sessions as determined by ses-* directories", "type": "array", "items": {"type": "string"}}, "session_id": {"description": "The session_id column of sessions.tsv", "type": "array", "items": {"type": "string"}}, "phenotype": {"description": "The union of session_id columns in phenotype files", "type": "array", "items": {"type": "string"}}}}}}, "path": {"description": "Path of the current file", "type": "string"}, "size": {"description": "Length of the current file in bytes", "type": "integer"}, "entities": {"description": "Entities parsed from the current filename", "type": "object", "additionalProperties": {"type": "string"}}, "datatype": {"description": "Datatype of current file, for examples, anat", "type": "string"}, "suffix": {"description": "Suffix of current file", "type": "string"}, "extension": {"description": "Extension of current file including initial dot", "type": "string"}, "modality": {"description": "Modality of current file, for examples, MRI", "type": "string"}, "sidecar": {"description": "Sidecar metadata constructed via the inheritance principle", "type": "object"}, "associations": {"description": "Associated files, indexed by suffix, selected according to the inheritance principle\n", "type": "object", "additionalProperties": false, "properties": {"events": {"description": "Events file", "type": "object", "required": ["path"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated events file", "type": "string"}, "onset": {"description": "Contents of the onset column", "type": "array", "items": {"type": "string"}}}}, "aslcontext": {"description": "ASL context file", "type": "object", "required": ["path", "n_rows"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated aslcontext file", "type": "string"}, "n_rows": {"description": "Number of rows in aslcontext.tsv", "type": "integer"}, "volume_type": {"description": "Contents of the volume_type column", "type": "array", "items": {"type": "string"}}}}, "m0scan": {"description": "M0 scan file", "type": "object", "required": ["path"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated M0 scan file", "type": "string"}}}, "magnitude": {"description": "Magnitude image file", "type": "object", "required": ["path"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated magnitude file", "type": "string"}}}, "magnitude1": {"description": "Magnitude1 image file", "type": "object", "required": ["path"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated magnitude1 file", "type": "string"}}}, "bval": {"description": "B value file", "type": "object", "required": ["path", "n_cols", "n_rows", "values"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated bval file", "type": "string"}, "n_cols": {"description": "Number of columns in bval file", "type": "integer"}, "n_rows": {"description": "Number of rows in bval file", "type": "integer"}, "values": {"description": "B-values contained in bval file", "type": "array", "items": {"type": "number"}}}}, "bvec": {"description": "B vector file", "type": "object", "required": ["path", "n_cols", "n_rows"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated bvec file", "type": "string"}, "n_cols": {"description": "Number of columns in bvec file", "type": "integer"}, "n_rows": {"description": "Number of rows in bvec file", "type": "integer"}}}, "channels": {"description": "Channels file", "type": "object", "required": ["path"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated channels file", "type": "string"}, "type": {"description": "Contents of the type column", "type": "array", "items": {"type": "string"}}, "short_channel": {"description": "Contents of the short_channel column", "type": "array", "items": {"type": "string"}}, "sampling_frequency": {"description": "Contents of the sampling_frequency column", "type": "array", "items": {"type": "string"}}}}, "coordsystem": {"description": "Coordinate system file", "type": "object", "required": ["path"], "additionalProperties": false, "properties": {"path": {"description": "Path to associated coordsystem file", "type": "string"}}}}}, "columns": {"description": "TSV columns, indexed by column header, values are arrays with column contents", "type": "object", "additionalProperties": {"type": "array", "items": {"type": "string"}}}, "json": {"description": "Contents of the current JSON file", "type": "object"}, "gzip": {"description": "Parsed contents of gzip header", "type": "object", "required": ["timestamp"], "additionalProperties": false, "properties": {"timestamp": {"description": "Modification time, unix timestamp", "type": "number"}, "filename": {"description": "Filename", "type": "string"}, "comment": {"description": "Comment", "type": "string"}}}, "nifti_header": {"name": "NIfTI Header", "description": "Parsed contents of NIfTI header referenced elsewhere in schema.", "type": "object", "required": ["dim_info", "dim", "pixdim", "shape", "voxel_sizes", "xyzt_units", "qform_code", "sform_code"], "additionalProperties": false, "properties": {"dim_info": {"name": "Dimension Information", "description": "Metadata about dimensions data.", "type": "object", "required": ["freq", "phase", "slice"], "additionalProperties": false, "properties": {"freq": {"name": "Frequency", "description": "These fields encode which spatial dimension (1, 2, or 3).", "type": "integer"}, "phase": {"name": "Phase", "description": "Corresponds to which acquisition dimension for MRI data.", "type": "integer"}, "slice": {"name": "Slice", "description": "Slice dimensions.", "type": "integer"}}}, "dim": {"name": "Data Dimensions", "description": "Data seq dimensions.", "type": "array", "minItems": 8, "maxItems": 8, "items": {"type": "integer"}}, "pixdim": {"name": "Pixel Dimension", "description": "Grid spacings (unit per dimension).", "type": "array", "minItems": 8, "maxItems": 8, "items": {"type": "number"}}, "shape": {"name": "Data shape", "description": "Data array shape, equal to dim[1:dim[0] + 1]", "type": "array", "minItems": 0, "maxItems": 7, "items": {"type": "integer"}}, "voxel_sizes": {"name": "Voxel sizes", "description": "Voxel sizes, equal to pixdim[1:dim[0] + 1]", "type": "array", "minItems": 0, "maxItems": 7, "items": {"type": "number"}}, "xyzt_units": {"name": "XYZT Units", "description": "Units of pixdim[1..4]", "type": "object", "required": ["xyz", "t"], "additionalProperties": false, "properties": {"xyz": {"name": "XYZ Units", "description": "String representing the unit of voxel spacing.", "type": "string", "enum": ["unknown", "meter", "mm", "um"]}, "t": {"name": "Time Unit", "description": "String representing the unit of inter-volume intervals.", "type": "string", "enum": ["unknown", "sec", "msec", "usec"]}}}, "qform_code": {"name": "qform code", "description": "Use of the quaternion fields.", "type": "integer"}, "sform_code": {"name": "sform code", "description": "Use of the affine fields.", "type": "integer"}, "mrs": {"name": "NIfTI-MRS extension", "description": "NIfTI-MRS JSON fields", "type": "object"}}}, "ome": {"name": "Open Microscopy Environment fields", "description": "Parsed contents of OME-XML header, which may be found in OME-TIFF or OME-ZARR files", "type": "object", "additionalProperties": false, "properties": {"PhysicalSizeX": {"name": "PhysicalSizeX", "description": "Pixels / @PhysicalSizeX", "type": "number"}, "PhysicalSizeY": {"name": "PhysicalSizeY", "description": "Pixels / @PhysicalSizeY", "type": "number"}, "PhysicalSizeZ": {"name": "PhysicalSizeZ", "description": "Pixels / @PhysicalSizeZ", "type": "number"}, "PhysicalSizeXUnit": {"name": "PhysicalSizeXUnit", "description": "Pixels / @PhysicalSizeXUnit", "type": "string"}, "PhysicalSizeYUnit": {"name": "PhysicalSizeYUnit", "description": "Pixels / @PhysicalSizeYUnit", "type": "string"}, "PhysicalSizeZUnit": {"name": "PhysicalSizeZUnit", "description": "Pixels / @PhysicalSizeZUnit", "type": "string"}}}, "tiff": {"name": "TIFF", "description": "TIFF file format metadata", "type": "object", "required": ["version"], "additionalProperties": false, "properties": {"version": {"name": "Version", "description": "TIFF file format version (the second 2-byte block)", "type": "integer"}}}}}, "expression_tests": [{"expression": "sidecar.MissingValue", "result": null}, {"expression": "null.anything", "result": null}, {"expression": "(null)", "result": null}, {"expression": "null[0]", "result": null}, {"expression": "null && true", "result": null}, {"expression": "true && null", "result": null}, {"expression": "false && null", "result": false}, {"expression": "true || null", "result": true}, {"expression": "null || true", "result": true}, {"expression": "false || null", "result": null}, {"expression": "!null", "result": true}, {"expression": "intersects([], null)", "result": false}, {"expression": "intersects(null, [])", "result": false}, {"expression": "allequal([], null)", "result": false}, {"expression": "allequal(null, [])", "result": false}, {"expression": "match(null, 'pattern')", "result": null}, {"expression": "match('string', null)", "result": false}, {"expression": "substr(null, 1, 4)", "result": null}, {"expression": "substr('string', null, 4)", "result": null}, {"expression": "substr('string', 1, null)", "result": null}, {"expression": "min(null)", "result": null}, {"expression": "max(null)", "result": null}, {"expression": "length(null)", "result": null}, {"expression": "type(null)", "result": "null"}, {"expression": "null == false", "result": false}, {"expression": "null == true", "result": false}, {"expression": "null != false", "result": true}, {"expression": "null != true", "result": true}, {"expression": "null != 1.5", "result": true}, {"expression": "null == null", "result": true}, {"expression": "null == 1", "result": false}, {"expression": "\"VolumeTiming\" in null", "result": null}, {"expression": "exists(null, \"bids-uri\")", "result": 0}, {"expression": "exists([], null)", "result": 0}, {"expression": "true || sidecar.MissingValue", "result": true}, {"expression": "1 + 2", "result": 3}, {"expression": "\"cat\" + \"dog\"", "result": "catdog"}, {"expression": "match('string', '.*')", "result": true}, {"expression": "match('', '.')", "result": false}, {"expression": "substr('string', 1, 4)", "result": "tri"}, {"expression": "substr('string', 0, 20)", "result": "string"}, {"expression": "type(1)", "result": "number"}, {"expression": "type([])", "result": "array"}, {"expression": "type({})", "result": "object"}, {"expression": "type(true)", "result": "boolean"}, {"expression": "intersects([1], [1, 2])", "result": true}, {"expression": "intersects([1], [])", "result": false}, {"expression": "length([1, 2, 3])", "result": 3}, {"expression": "length([])", "result": 0}, {"expression": "count([1, 2, 3], 1)", "result": 1}, {"expression": "index([\"i\", \"j\", \"k\"], \"i\")", "result": 0}, {"expression": "index([\"i\", \"j\", \"k\"], \"j\")", "result": 1}, {"expression": "index([\"i\", \"j\", \"k\"], \"x\")", "result": null}, {"expression": "sorted([3, 2, 1])", "result": [1, 2, 3]}, {"expression": "sorted([1, 2, 5, 10], \"lexical\")", "result": [1, 10, 2, 5]}, {"expression": "sorted([\"1\", \"2\", \"5\", \"10\"])", "result": ["1", "10", "2", "5"]}, {"expression": "sorted([\"1\", \"2\", \"5\", \"10\"], \"numeric\")", "result": ["1", "2", "5", "10"]}, {"expression": "sorted([\"1\", \"2\", \"n/a\"], \"numeric\")", "result": ["1", "2", "n/a"]}, {"expression": "sorted([\"n/a\", \"2\", \"1\"], \"numeric\")", "result": ["n/a", "1", "2"]}, {"expression": "allequal(sorted([3, 2, 1]), [1, 2, 3])", "result": true}, {"expression": "sorted([9, 81, 729, 6561])", "result": [9, 81, 729, 6561]}, {"expression": "min([-1, \"n/a\", 1])", "result": -1}, {"expression": "max([-1, \"n/a\", 1])", "result": 1}, {"expression": "[3, 2, 1][0]", "result": 3}, {"expression": "\"string\"[0]", "result": "s"}], "versions": ["1.9.0", "1.8.0", "1.7.0", "1.6.0", "1.5.0", "1.4.1", "1.4.0", "1.3.0", "1.2.2", "1.2.1", "1.2.0", "1.1.2", "1.1.1", "1.1.0", "1.0.2", "1.0.1", "1.0.0"]}, "objects": {"columns": {"HED": {"name": "HED", "display_name": "HED Tag", "description": "Hierarchical Event Descriptor (HED) Tag.\nSee the [HED Appendix](SPEC_ROOT/appendices/hed.md) for details.\n", "type": "string"}, "abbreviation": {"name": "abbreviation", "display_name": "Abbreviation", "description": "The unique label abbreviation\n", "type": "string"}, "acq_time__scans": {"name": "acq_time", "display_name": "Scan acquisition time", "description": "Acquisition time refers to when the first data point in each run was acquired.\nFurthermore, if this header is provided, the acquisition times of all files\nfrom the same recording MUST be identical.\nDatetime format and their deidentification are described in\n[Units](SPEC_ROOT/common-principles.md#units).\n", "type": "string", "format": "datetime"}, "acq_time__sessions": {"name": "acq_time", "display_name": "Session acquisition time", "description": "Acquisition time refers to when the first data point of the first run was acquired.\nDatetime format and their deidentification are described in\n[Units](SPEC_ROOT/common-principles.md#units).\n", "type": "string", "format": "datetime"}, "age": {"name": "age", "display_name": "Subject age", "description": "Numeric value in years (float or integer value).\n\nIt is recommended to tag participant ages that are 89 or higher as 89+,\nfor privacy purposes.\n", "definition": {"LongName": "Subject age", "Description": "Subject age in postnatal years", "Units": "year"}}, "cardiac": {"name": "cardiac", "display_name": "Cardiac measurement", "description": "continuous pulse measurement\n", "definition": {"Description": "continuous pulse measurement", "Units": "mV"}}, "channel": {"name": "channel", "display_name": "Channel", "description": "Channel(s) associated with an event.\nIf multiple channels are specified, they MUST be separated by a delimiter\nspecified in the `\"Delimiter\"` field describing the `channel` column.\nFor example, channels separated with a comma (`,`) require the `events.json`\nfile to contain `\"channel\": {\"Delimiter\": \",\"}`.\nIn the absence of a delimiter, tools MUST interpret any character as being part\nof a channel name.\n", "type": "string"}, "color": {"name": "color", "display_name": "Color label", "description": "Hexadecimal. Label color for visualization.\n", "type": "string", "unit": "hexadecimal"}, "component": {"name": "component", "display_name": "Component", "description": "Description of the spatial axis or label of quaternion component associated with the channel.\nFor example, `x`,`y`,`z` for position channels,\nor `quat_x`, `quat_y`, `quat_z`, `quat_w` for quaternion orientation channels.\n", "type": "string", "enum": ["x", "y", "z", "quat_x", "quat_y", "quat_z", "quat_w"]}, "detector__channels": {"name": "detector", "display_name": "Detector Name", "description": "Name of the detector as specified in the `*_optodes.tsv` file.\n`n/a` for channels that do not contain NIRS signals (for example, acceleration).\n", "type": "string"}, "detector_type": {"name": "detector_type", "display_name": "Detector Type", "description": "The type of detector. Only to be used if the field `DetectorType` in `*_nirs.json` is set to `mixed`.\n", "anyOf": [{"type": "string"}]}, "derived_from": {"name": "derived_from", "display_name": "Derived from", "description": "`sample-