diff --git a/.github/workflows/update-cv.yaml b/.github/workflows/update-cv.yaml index 60bf9c1..4f9cf87 100644 --- a/.github/workflows/update-cv.yaml +++ b/.github/workflows/update-cv.yaml @@ -8,6 +8,7 @@ on: push: paths: - 'CORDEX-CMIP6_*.json' + - 'scripts/*.py' branches: - main @@ -43,6 +44,10 @@ jobs: create-cv mv CORDEX-CMIP6_CV.json tables/Tables + - name: Create JSON schema + run: | + create-json-schema + - name: Check if there are any changes id: verify_diff run: | @@ -52,10 +57,10 @@ jobs: - name: Commit and push if: steps.verify_diff.outputs.changed == 'true' run: | - cd tables git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" (pre-commit run --all-files) || true - git commit Tables/CORDEX-CMIP6_CV.json -m "CV update" + git commit tables/Tables/CORDEX-CMIP6_CV.json -m "CV update" + git commit cmip6-cordex-global-attrs-schema.json -m "JSON schema update" git status git push diff --git a/scripts/cordex_cv/__init__.py b/scripts/cordex_cv/__init__.py index be9f0c8..7d7f67c 100644 --- a/scripts/cordex_cv/__init__.py +++ b/scripts/cordex_cv/__init__.py @@ -1,5 +1,12 @@ from .cv import create_cv, update_table from .domain_id import create_domain_id from .driving_source_id import create_driving_source_id +from .cv2schema import create_json_schema -__all__ = ["create_cv", "create_domain_id", "update_table", "create_driving_source_id"] +__all__ = [ + "create_cv", + "create_domain_id", + "update_table", + "create_driving_source_id", + "create_json_schema", +] diff --git a/scripts/cordex_cv/cli.py b/scripts/cordex_cv/cli.py index 71abc56..afa0750 100644 --- a/scripts/cordex_cv/cli.py +++ b/scripts/cordex_cv/cli.py @@ -1,4 +1,4 @@ -from . import create_cv, create_domain_id, create_driving_source_id +from . 
"""Export the CVs to JSON-schema.

The function `make_global_attrs_schema` reads the CV files
(``CORDEX-CMIP6_<name>.json``) from the current working directory and returns
a JSON schema. This schema can then be used to validate global attributes
from CORDEX simulations.

For example::

    import jsonschema
    import xarray as xr

    ds = xr.open_dataset("path/to/file.nc")
    schema = make_global_attrs_schema()
    jsonschema.validate(ds.attrs, schema)

Any missing or incorrect global attribute will raise a `ValidationError`.
"""

import json
from typing import Optional

# For CVs whose entries are themselves dictionaries: the key inside each entry
# whose value is used as the human-readable "title" of the allowed value.
_TITLE_FIELDS = {
    "source_id": "label",
    "experiment_id": "description",
    "domain_id": "domain",
    "driving_source_id": "driving_source",
}


def make_global_attrs_schema(prefix: Optional[str] = None, enum: bool = False) -> dict:
    """Create a JSON schema for netCDF global attributes from the JSON CVs.

    Parameters
    ----------
    prefix : str, optional
        Prefix to add to all properties. When given, every property name
        becomes ``"<prefix>:<attribute>"``.
    enum : bool
        If True, return an enum schema instead of oneOf, leading to smaller,
        easier to read schemas.

    Returns
    -------
    dict
        JSON schema for global attributes.

    Raises
    ------
    FileNotFoundError
        If the ``required_global_attributes`` CV file cannot be opened.
    ValueError
        If one of the CV files holds more than one top-level key.
    """
    prefix = prefix + ":" if prefix else ""

    # Read required global attributes
    reqs = read_cv("required_global_attributes")["required_global_attributes"]

    # Attributes that must be integers. Currently empty; kept as an explicit hook.
    integer_fields = []

    formats = {"creation_date": "date-time"}

    props = {}
    for fid in reqs:
        if fid in integer_fields:
            # Could be replaced by patternProperties, but at the expense of readability
            props[prefix + fid] = {"type": "integer"}
        else:
            try:
                cv = read_cv(fid)
            except FileNotFoundError:
                # No CV file for this attribute: any string is accepted.
                props[prefix + fid] = {"type": "string"}
            else:
                for key, val in cv_to_property(cv, enum=enum).items():
                    props[prefix + key] = val

        if fid in formats:
            # setdefault guards against a CV that yielded no property for `fid`,
            # which previously raised KeyError here.
            props.setdefault(prefix + fid, {"type": "string"})["format"] = formats[fid]

    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "$id": "cmip6-cordex-global-attrs-schema.json#",
        "title": "CORDEX-CMIP6 metadata schema for global attributes",
        "description": "JSON schema for global attributes of CORDEX-CMIP6 datasets. This schema is automatically "
        "generated from the CVs. Manual edits will be overwritten.",
        "type": "object",
        "properties": props,
        "required": [prefix + fid for fid in reqs],
    }


def cv_to_property(cv: dict, enum: bool = False) -> dict:
    """Convert a CV to a JSON schema property.

    Parameters
    ----------
    cv : dict
        CV dictionary holding at most one top-level key (the CV name). Its
        value is either a mapping of allowed values to a description (a string
        or a dictionary of details), or a plain list of allowed values.
    enum : bool
        If True, return an enum schema instead of oneOf.

    Returns
    -------
    dict
        Mapping from the CV name to its JSON schema property. Empty when the
        CV is empty or its value is neither a dictionary nor a list.

    Raises
    ------
    ValueError
        If the CV has more than one top-level key.
    """
    if len(cv) > 1:
        raise ValueError("CV has more than one key.")

    out = {}
    for fid, keys in cv.items():
        if isinstance(keys, dict):
            # .get() so that dict-valued CVs without a known title field do not
            # raise KeyError; they simply get an empty title.
            title_key = _TITLE_FIELDS.get(fid)
            items = []
            for key, value in keys.items():
                if isinstance(value, str):
                    title = value
                elif isinstance(value, dict):
                    title = value.get(title_key, "")
                else:
                    # The key is still an allowed value even when its
                    # description has an unexpected type.
                    title = ""
                items.append({"const": key, "title": title})
            if enum:
                out[fid] = {"enum": [item["const"] for item in items]}
            else:
                out[fid] = {"oneOf": items}
        elif isinstance(keys, list):
            out[fid] = {"enum": keys}
    return out


def read_cv(key: str) -> dict:
    """Read a CV file and return it as a dictionary.

    The file ``CORDEX-CMIP6_<key>.json`` is looked up relative to the current
    working directory.
    """
    path = f"CORDEX-CMIP6_{key}.json"
    # Explicit encoding: do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def create_json_schema():
    """Build the global-attributes schema and hand it to ``write_json``.

    The schema is generated with the ``cordex6`` prefix in enum form and
    passed to ``write_json`` together with the file name
    ``cmip6-cordex-global-attrs-schema.json``.
    """
    # Imported here rather than at module level, as in the original.
    # NOTE(review): write_json presumably serializes the schema to that path - confirm in .common.
    from .common import write_json

    schema = make_global_attrs_schema(prefix="cordex6", enum=True)
    write_json("cmip6-cordex-global-attrs-schema.json", schema)
a/scripts/pyproject.toml b/scripts/pyproject.toml index f2c9d85..0eefbc3 100644 --- a/scripts/pyproject.toml +++ b/scripts/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ create-cv = "cordex_cv.cli:create_cv" create-domain_id = "cordex_cv.cli:domain_id" create-driving_source_id = "cordex_cv.cli:driving_source_id" +create-json-schema = "cordex_cv.cli:json_schema" [tool.setuptools]