Start adding schema to CuBIDS #392

Draft · wants to merge 13 commits into base: main
30 changes: 30 additions & 0 deletions cubids/config.py
@@ -4,6 +4,7 @@
import importlib.resources
import yaml


def load_config(config_file):
"""Load a YAML file containing a configuration for param groups.

@@ -24,3 +25,32 @@ def load_config(config_file):
config = yaml.safe_load(f)

return config


def load_schema(schema_file):
"""Load a JSON file containing the BIDS schema.

Parameters
----------
schema_file : str or pathlib.Path, optional
The path to the schema file. If None, the default schema file is used.

Returns
-------
dict
The schema loaded from the JSON file.
"""
import json

if schema_file is None:
schema_file = Path(importlib.resources.files("cubids") / "data/schema.json")

with Path(schema_file).open() as f:
schema = json.load(f)

print(
f"Loading BIDS schema version: {schema['schema_version']}. "
f"BIDS version: {schema['bids_version']}"
)

return schema
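
For reviewers, a minimal sketch of how the new loader is exercised (the printed version fields are whatever the vendored data/schema.json contains):

from cubids.config import load_schema

# Passing None falls back to the schema bundled with CuBIDS.
schema = load_schema(None)

# The three pieces of the schema that build_path() (below) relies on:
entity_order = schema["rules"]["entities"]  # e.g., [..., "task", "acquisition", "run", ...]
short_keys = {k: v["name"] for k, v in schema["objects"]["entities"].items()}  # "acquisition" -> "acq"
datatypes = list(schema["objects"]["datatypes"])  # e.g., ["anat", "dwi", "fmap", "func", "perf", ...]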
112 changes: 82 additions & 30 deletions cubids/cubids.py
@@ -21,7 +21,7 @@
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm

from cubids.config import load_config, load_schema
from cubids.constants import ID_VARS, NON_KEY_ENTITIES
from cubids.metadata_merge import check_merging_operations, group_by_acquisition_sets

@@ -47,6 +47,9 @@ class CuBIDS(object):
force_unlock : :obj:`bool`, optional
If True, force unlock all files in the BIDS dataset.
Default is False.
schema_json : :obj:`str`, optional
Path to a BIDS schema JSON file.
Default is None, in which case the default schema in CuBIDS is used.

Attributes
----------
@@ -93,6 +96,7 @@ def __init__(
acq_group_level="subject",
grouping_config=None,
force_unlock=False,
schema_json=None,
):
self.path = os.path.abspath(data_root)
self._layout = None
@@ -110,6 +114,8 @@ def __init__(
self.cubids_code_dir = Path(self.path + "/code/CuBIDS").is_dir()
self.data_dict = {} # data dictionary for TSV outputs
self.use_datalad = use_datalad # True if flag set, False if flag unset
self.schema = load_schema(schema_json)

if self.use_datalad:
self.init_datalad()

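
A hypothetical way to exercise the new keyword from the Python API (paths are illustrative):

from cubids.cubids import CuBIDS

# Default: schema_json=None loads the schema bundled with CuBIDS.
bids = CuBIDS("/data/bids_dataset", use_datalad=False)

# Or point at a specific schema JSON file (hypothetical path).
bids = CuBIDS("/data/bids_dataset", schema_json="/configs/bids_schema.json")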
@@ -471,8 +477,9 @@ def change_filename(self, filepath, entities):
"""
new_path = build_path(
filepath=filepath,
out_entities=entities,
out_dir=str(self.path),
schema=self.schema,
)

exts = Path(filepath).suffixes
@@ -1734,18 +1741,30 @@ def get_entity_value(path, key):
return part


def build_path(filepath, out_entities, out_dir, schema):
"""Build a new path for a file based on its BIDS entities.

This function could ultimately be replaced with bids.BIDSLayout.build_path(),
but that method doesn't use the schema.

Parameters
----------
filepath : str
The original file path.
out_entities : dict
A dictionary of BIDS entities.
This should include all of the entities in the filename *except* for subject and session, which are always inferred from the original file path.
out_dir : str
The output directory for the new file.
schema : dict
The BIDS schema. The elements that are used in this function include:

- schema["rules"]["entities"]: a list of valid BIDS entities,
in the order they must appear in filenames.
- schema["objects"]["entities"]: a dictionary mapping entity names
(e.g., acquisition) to their corresponding keys (e.g., acq).
- schema["objects"]["datatypes"]: a dictionary defining the valid datatypes.
This function only uses the keys of this dictionary.
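
For example, the parts of the schema used here look like this (abridged; structure assumed from how the fields are used below)::

{
"rules": {"entities": ["subject", "session", "task", "acquisition", "run", ...]},
"objects": {
"entities": {"acquisition": {"name": "acq"}, ...},
"datatypes": {"anat": {...}, "func": {...}, ...},
},
}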

Returns
-------
@@ -1754,103 +1773,143 @@ def build_path(filepath, entities, out_dir):

Examples
--------
>>> import json
>>> import importlib
>>> schema_file = Path(importlib.resources.files("cubids") / "data/schema.json")
>>> with schema_file.open() as f:
... schema = json.load(f)
>>> build_path(
... "/input/sub-01/ses-01/anat/sub-01_ses-01_T1w.nii.gz",
... {"acquisition": "VAR", "suffix": "T2w"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/anat/sub-01_ses-01_acq-VAR_T2w.nii.gz'

The function does not add an extra leading zero to the run entity when it's a string.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz",
... {"task": "rest", "run": "2", "acquisition": "VAR", "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz'

The function adds an extra leading zero to the run entity when it's an integer
and the original filename has a leading zero.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-00001_bold.nii.gz",
... {"task": "rest", "run": 2, "acquisition": "VAR", "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-00002_bold.nii.gz'

The function does not add an extra leading zero to the run entity when it's an integer
and the original filename doesn't have a leading zero.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-1_bold.nii.gz",
... {"task": "rest", "run": 2, "acquisition": "VAR", "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz'

The function doesn't add an extra leading zero to the run entity when there isn't a zero.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-1_bold.nii.gz",
... {"task": "rest", "run": "2", "acquisition": "VAR", "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz'

Entities in the original path, but not the entity dictionary, are not included,
like run in this case.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz",
... {"task": "rest", "acquisition": "VAR", "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz'

The "subject" and "session" entities are ignored.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz",
... {"subject": "02", "task": "rest", "acquisition": "VAR", "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz'

or "echo".
But uncommon (but BIDS-valid) entities, like echo, will work.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz",
... {"task": "rest", "acquisition": "VAR", "echo": 1, "suffix": "bold"},
... "/output",
... schema,
... )
'/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_echo-1_bold.nii.gz'

It can change the datatype, but will warn the user.

>>> build_path(
... "/input/sub-01/ses-01/anat/sub-01_ses-01_asl.nii.gz",
... {"datatype": "perf", "acquisition": "VAR", "suffix": "asl"},
... "/output",
... schema,
... )
WARNING: DATATYPE CHANGE DETECTED
'/output/sub-01/ses-01/perf/sub-01_ses-01_acq-VAR_asl.nii.gz'

The datatype change is subject to false positives.

>>> build_path(
... "/input/sub-01/ses-01/func/sub-01_ses-01_task-meg_bold.nii.gz",
... {"datatype": "func", "acquisition": "VAR", "task": "meg", "suffix": "bold"},
... "/output",
... schema,
... )
WARNING: DATATYPE CHANGE DETECTED
'/output/sub-01/ses-01/func/sub-01_ses-01_task-meg_acq-VAR_bold.nii.gz'

It expects a longitudinal structure, so providing a cross-sectional filename won't work.
XXX: This is a bug.

>>> build_path(
... "/input/sub-01/func/sub-01_task-rest_run-01_bold.nii.gz",
... {"task": "rest", "acquisition": "VAR", "echo": 1, "suffix": "bold"},
... "/output",
... schema,
... )
Traceback (most recent call last):
ValueError: Could not extract subject or session from ...
"""
exts = Path(filepath).suffixes
old_ext = "".join(exts)

suffix = entities["suffix"]
entity_file_keys = []
suffix = out_entities["suffix"]

valid_entities = schema["rules"]["entities"]
entity_names_to_keys = {
k: v["name"] for k, v in schema["objects"]["entities"].items()
}
valid_datatypes = list(schema["objects"]["datatypes"].keys())
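# e.g., valid_entities = [..., "task", "acquisition", "run", ...] and
# valid_datatypes = ["anat", "dwi", "fmap", "func", "perf", ...]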

# Remove subject and session from the entities
file_entities = {k: v for k, v in out_entities.items() if k not in ["subject", "session"]}
# Limit file entities to valid entities from BIDS (sorted in right order)
file_entities = {k: out_entities[k] for k in valid_entities if k in file_entities}
# Replace entity names with keys (e.g., acquisition with acq)
file_entities = {entity_names_to_keys[k]: v for k, v in file_entities.items()}
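# e.g., {"task": "rest", "acquisition": "VAR", "run": "2"} is now
# {"task": "rest", "acq": "VAR", "run": "2"}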

sub = get_entity_value(filepath, "sub")
ses = get_entity_value(filepath, "ses")
Expand All @@ -1859,39 +1918,32 @@ def build_path(filepath, entities, out_dir):

# Add leading zeros to run entity if it's an integer.
# If it's a string, respect the value provided.
if "run" in entities.keys() and isinstance(entities["run"], int):
if "run" in file_entities.keys() and isinstance(file_entities["run"], int):
# Infer the number of leading zeros needed from the original filename
n_leading = 2 # default width of 2 (i.e., one leading zero for runs 1-9)
if "_run-" in filepath:
run_str = filepath.split("_run-")[1].split("_")[0]
n_leading = len(run_str)
entities["run"] = str(entities["run"]).zfill(n_leading)
file_entities["run"] = str(file_entities["run"]).zfill(n_leading)

filename = "_".join([f"{key}-{entities[key]}" for key in entity_file_keys])
filename = (
filename.replace("acquisition", "acq")
.replace("direction", "dir")
.replace("reconstruction", "rec")
)
filename = "_".join([f"{key}-{value}" for key, value in file_entities.items()])
if len(filename) > 0:
filename = f"{sub}_{ses}_{filename}_{suffix}{old_ext}"
else:
raise ValueError(f"Could not construct new filename for {filepath}")

# datatype may be overridden/changed if the original file is located in the wrong folder.
# XXX: This check for the datatype is fragile and should be improved.
# For example, what if we have sub-01/func/sub-01_task-anatomy_bold.nii.gz?
dtype_orig = ""
for dtype in valid_datatypes:
if dtype in filepath:
dtype_orig = dtype

if "datatype" in entities.keys():
dtype_new = entities["datatype"]
if entities["datatype"] != dtype_orig:
print("WARNING: DATATYPE CHANGE DETECTED")
else:
dtype_new = dtype_orig
dtype_new = out_entities.get("datatype", dtype_orig)
if dtype_new != dtype_orig:
print("WARNING: DATATYPE CHANGE DETECTED")

# Construct the new filename
new_path = str(Path(out_dir) / sub / ses / dtype_new / filename)
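
Finally, to make the review concrete: a self-contained sketch of calling the reworked build_path() with a toy schema. The toy dict mimics only the three schema fields the function reads (structure assumed from the diff above; the real vendored data/schema.json is far larger), and the expected output follows the doctests.

from cubids.cubids import build_path

# Toy schema covering only the fields build_path() reads.
toy_schema = {
    "rules": {"entities": ["subject", "session", "task", "acquisition", "run"]},
    "objects": {
        "entities": {
            "subject": {"name": "sub"},
            "session": {"name": "ses"},
            "task": {"name": "task"},
            "acquisition": {"name": "acq"},
            "run": {"name": "run"},
        },
        "datatypes": {"anat": {}, "func": {}},
    },
}

new_path = build_path(
    "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz",
    {"task": "rest", "acquisition": "VAR", "run": 2, "suffix": "bold"},
    "/output",
    toy_schema,
)
# -> '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-02_bold.nii.gz'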