Skip to content

Commit

Permalink
Merge pull request #188 from Ouranosinc/cv2json
Browse files Browse the repository at this point in the history
Generate json schema out of CV
  • Loading branch information
larsbuntemeyer authored Oct 9, 2024
2 parents d4239e7 + 43e96f8 commit 2485ba4
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 4 deletions.
9 changes: 7 additions & 2 deletions .github/workflows/update-cv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
push:
paths:
- 'CORDEX-CMIP6_*.json'
- 'scripts/*.py'
branches:
- main

Expand Down Expand Up @@ -43,6 +44,10 @@ jobs:
create-cv
mv CORDEX-CMIP6_CV.json tables/Tables
- name: Create JSON schema
run: |
create-json-schema
- name: Check if there are any changes
id: verify_diff
run: |
Expand All @@ -52,10 +57,10 @@ jobs:
- name: Commit and push
if: steps.verify_diff.outputs.changed == 'true'
run: |
cd tables
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
(pre-commit run --all-files) || true
git commit Tables/CORDEX-CMIP6_CV.json -m "CV update"
git commit tables/Tables/CORDEX-CMIP6_CV.json -m "CV update"
git commit cmip6-cordex-global-attrs-schema.json -m "JSON schema update"
git status
git push
9 changes: 8 additions & 1 deletion scripts/cordex_cv/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from .cv import create_cv, update_table
from .domain_id import create_domain_id
from .driving_source_id import create_driving_source_id
from .cv2schema import create_json_schema

__all__ = ["create_cv", "create_domain_id", "update_table", "create_driving_source_id"]
__all__ = [
"create_cv",
"create_domain_id",
"update_table",
"create_driving_source_id",
"create_json_schema",
]
7 changes: 6 additions & 1 deletion scripts/cordex_cv/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import create_cv, create_domain_id, create_driving_source_id
from . import create_cv, create_domain_id, create_driving_source_id, create_json_schema


def cv():
Expand All @@ -14,3 +14,8 @@ def domain_id():
def driving_source_id():
    """Command-line entry point: report progress, then run ``create_driving_source_id``."""
    print("creating driving_source_id")
    create_driving_source_id()


def json_schema():
    """Command-line entry point: report progress, then run ``create_json_schema``."""
    print("creating json schema")
    create_json_schema()
128 changes: 128 additions & 0 deletions scripts/cordex_cv/cv2schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import json

"""
# Export the CVs to JSON-schema.
The function `make_global_attrs_schema` reads the CVs in the root directory and returns a JSON schema. This schema can
then be used to validate the global attributes of CORDEX simulations.
For example:
```
import jsonschema
import xarray as xr
ds = xr.open_dataset("<path to netCDF file>")
schema = make_global_attrs_schema()
jsonschema.validate(ds.attrs, schema)
```
Any missing or incorrect global attribute will raise a `ValidationError`.
"""


def make_global_attrs_schema(prefix: str = None, enum: bool = False) -> dict:
    """Create a JSON schema for netCDF global attributes from the JSON CVs.

    The names of the required attributes are taken from the ``required_global_attributes`` CV. For each of
    them, the matching CV file (if any) is converted into a property listing the allowed values; attributes
    without a CV file are declared as free-form strings.

    Parameters
    ----------
    prefix : str, optional
        Prefix to add to all properties. When given, it is joined to the attribute name with ``":"``.
    enum : bool
        If True, return an enum schema instead of oneOf, leading to smaller, easier to read schemas.

    Returns
    -------
    dict
        JSON schema for global attributes.

    Raises
    ------
    FileNotFoundError
        If the ``required_global_attributes`` CV file itself cannot be found.
    ValueError
        If one of the CV files holds more than one top-level key.
    """
    prefix = prefix + ":" if prefix else ""

    # Read required global attributes
    reqs = read_cv("required_global_attributes")["required_global_attributes"]

    # Attributes that must be typed as integers; none are declared at the moment.
    integer_fields = []

    # Attributes carrying an additional JSON schema "format" annotation.
    formats = {"creation_date": "date-time"}

    props = {}
    for fid in reqs:
        if fid in integer_fields:
            # Could be replaced by patternProperties, but at the expense of readability
            props[prefix + fid] = {"type": "integer"}
        else:
            # Only the file lookup may legitimately fail; keep the conversion outside the try so that
            # unrelated errors are never mistaken for a missing CV.
            try:
                cv = read_cv(fid)
            except FileNotFoundError:
                props[prefix + fid] = {"type": "string"}
            else:
                for key, val in cv_to_property(cv, enum=enum).items():
                    props[prefix + key] = val

        if fid in formats:
            # The CV may not have produced a property named after `fid` (e.g. unsupported CV layout);
            # fall back to a plain string property instead of failing with a KeyError.
            props.setdefault(prefix + fid, {"type": "string"})["format"] = formats[fid]

    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "$id": "cmip6-cordex-global-attrs-schema.json#",
        "title": "CORDEX-CMIP6 metadata schema for global attributes",
        "description": "JSON schema for global attributes of CORDEX-CMIP6 datasets. This schema is automatically "
        "generated from the CVs. Manual edits will be overwritten.",
        "type": "object",
        "properties": props,
        "required": [prefix + fid for fid in reqs],
    }


def cv_to_property(cv: dict, enum: bool = False) -> dict:
    """Convert a CV to a JSON schema property.

    Parameters
    ----------
    cv : dict
        CV dictionary with at most one top-level key (the attribute name). Its value is either a mapping
        from allowed values to their description, or a plain list of allowed values.
    enum : bool
        If True, return an enum schema instead of oneOf.

    Returns
    -------
    dict
        Mapping from the attribute name to its JSON schema property. Empty if the CV is empty or if its
        value is neither a dict nor a list.

    Raises
    ------
    ValueError
        If the CV has more than one top-level key.
    """
    if len(cv) > 1:
        raise ValueError("CV has more than one key.")

    # For CVs whose entries are themselves dictionaries: the sub-key used as the entry's title.
    title_keys = {
        "source_id": "label",
        "experiment_id": "description",
        "domain_id": "domain",
        "driving_source_id": "driving_source",
    }

    out = {}
    for fid, entries in cv.items():
        if isinstance(entries, dict):
            if enum:
                # Every key of the CV is an allowed value, whatever the type of its description.
                out[fid] = {"enum": list(entries)}
                continue

            # CVs without a registered title key simply get empty titles instead of raising KeyError.
            title_key = title_keys.get(fid)
            items = []
            for key, value in entries.items():
                if isinstance(value, str):
                    title = value
                elif isinstance(value, dict) and title_key is not None:
                    title = value.get(title_key, "")
                else:
                    title = ""
                items.append({"const": key, "title": title})
            out[fid] = {"oneOf": items}
        elif isinstance(entries, list):
            out[fid] = {"enum": entries}
    return out


def read_cv(key: str) -> dict:
    """Read a CV file and return it as a dictionary.

    Parameters
    ----------
    key : str
        Name of the CV. The file read is ``CORDEX-CMIP6_<key>.json``, given as a relative path and
        therefore resolved against the current working directory.

    Returns
    -------
    dict
        Parsed JSON content of the CV file.

    Raises
    ------
    FileNotFoundError
        If the CV file does not exist.
    """
    path = f"CORDEX-CMIP6_{key}.json"
    # Decode explicitly as UTF-8 rather than relying on the platform's default text encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def create_json_schema():
    """Build the global-attributes JSON schema and hand it to ``write_json``.

    The schema is generated with the ``cordex6`` prefix and with ``enum`` lists, and is passed to
    ``write_json`` together with the file name ``cmip6-cordex-global-attrs-schema.json``.
    """
    from .common import write_json

    write_json(
        "cmip6-cordex-global-attrs-schema.json",
        make_global_attrs_schema(prefix="cordex6", enum=True),
    )
1 change: 1 addition & 0 deletions scripts/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies = [
create-cv = "cordex_cv.cli:create_cv"
create-domain_id = "cordex_cv.cli:domain_id"
create-driving_source_id = "cordex_cv.cli:driving_source_id"
create-json-schema = "cordex_cv.cli:json_schema"


[tool.setuptools]
Expand Down

0 comments on commit 2485ba4

Please sign in to comment.