diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0cafdb3d8..321274a9d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,7 +36,7 @@ repos: - id: ruff - repo: https://github.com/tox-dev/pyproject-fmt - rev: 2.0.4 + rev: 2.1.3 hooks: - id: pyproject-fmt diff --git a/compliance_checker/protocols/netcdf.py b/compliance_checker/protocols/netcdf.py index be99d1e40..94cf7b594 100644 --- a/compliance_checker/protocols/netcdf.py +++ b/compliance_checker/protocols/netcdf.py @@ -25,18 +25,26 @@ def is_netcdf(url): if url.endswith("nc"): return True - # Brute force - with open(url, "rb") as f: - magic_number = f.read(4) - if len(magic_number) < 4: - return False - if is_classic_netcdf(magic_number): - return True - elif is_hdf5(magic_number): - return True - + try: + # Brute force + with open(url, "rb") as f: + magic_number = f.read(4) + if len(magic_number) < 4: + return False + if is_classic_netcdf(magic_number): + return True + elif is_hdf5(magic_number): + return True + except PermissionError: + # open will fail for both a directory or a local url, either of which may be pointing to a Zarr dataset + # directory + return False + except OSError: + # local file url return False + return False + def is_classic_netcdf(file_buffer): """ diff --git a/compliance_checker/protocols/opendap.py b/compliance_checker/protocols/opendap.py index c78363861..b69b3a2a3 100644 --- a/compliance_checker/protocols/opendap.py +++ b/compliance_checker/protocols/opendap.py @@ -55,14 +55,19 @@ def is_opendap(url): das_url = url.replace("#fillmismatch", ".das") else: das_url = url + ".das" - response = requests.get(das_url, allow_redirects=True) - if "xdods-server" in response.headers: - return True - # Check if it is an access restricted ESGF thredds service - if ( - response.status_code == 401 - and "text/html" in response.headers["content-type"] - and "The following URL requires authentication:" in response.text - ): - return True + + try: + response = 
def is_zarr(url):
    """
    Guess whether ``url`` points to a Zarr dataset.

    The checks run in order and the first match wins:

    1. anything ``netcdf.is_netcdf`` accepts is *not* Zarr;
    2. a location containing ``.zarr`` is Zarr;
    3. a URL whose scheme is ``https``, ``s3`` or ``file`` is assumed to be
       Zarr (a deliberately permissive heuristic: nothing is fetched);
    4. a zip archive with a ``.zmetadata`` member is Zarr;
    5. a directory containing a ``.zmetadata`` file is Zarr.

    :param url: str or pathlib.Path locating the candidate dataset
    :return: True if the location looks like a Zarr dataset, else False
    """
    # Normalise once so pathlib.Path inputs work, as they do for as_zarr.
    url = str(url)

    if netcdf.is_netcdf(url):
        return False

    if ".zarr" in url:
        return True

    if urlparse(url).scheme in ("https", "s3", "file"):
        return True

    if zipfile.is_zipfile(url):
        # Context manager guarantees the archive handle is released.
        with ZipFile(url) as archive:
            if ".zmetadata" in archive.namelist():
                return True

    candidate = Path(url)
    if candidate.is_dir() and (candidate / ".zmetadata").exists():
        return True

    return False


def as_zarr(url):
    """
    Transform a pointer to a Zarr dataset into a valid NCZarr URL, as
    described in
    https://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in

    Distinct from is_cdl etc. in that it returns the appropriate URI rather
    than a boolean.

    Not tested on Windows paths at the moment, as NCZarr is not supported on
    Windows.

    A valid Zarr dataset could be provided in any of the following forms:

        "http://s3.amazonaws.com/bucket/dataset.zarr"
        "http://s3.amazonaws.com/bucket/dataset.zarr#mode=nczarr,s3"
        "/home/path/to/dataset.zarr"
        Path("/home/path/to/dataset.zarr")
        "file:///home/path/to/dataset.zarr"
        "file:///home/path/to/dataset.randomExt#mode=nczarr,file"
        "file:///home/path/to/dataset.zarr#mode=nczarr,zip"

    :param url: str or pathlib.Path pointing to a Zarr dataset
    :return: str URL ending in a ``#...mode=nczarr,<s3|zip|file>`` fragment
    :raises ValueError: if a local path is neither a zip archive nor a
                        directory and no explicit ``mode=nczarr`` fragment
                        was supplied
    """
    url_str = str(url)
    pr = urlparse(url_str)
    explicit_mode = "mode=nczarr" in pr.fragment

    if explicit_mode and (pr.netloc or pr.scheme == "file"):
        return url_str  # already valid nczarr url

    if pr.netloc:
        # Remote dataset: decided from the URL alone, no filesystem access.
        mode = "s3"
        url_base = url_str.partition("#")[0]
    else:
        # url2pathname necessary to avoid urlparse bug in windows
        zarr_path = Path(url2pathname(pr.path)).resolve()
        url_base = zarr_path.as_uri()

        if explicit_mode:
            # The caller already chose the storage mode; honour it instead of
            # re-detecting it (and possibly failing) from the filesystem.
            return f"{url_base}#{pr.fragment}"

        if zipfile.is_zipfile(zarr_path):
            mode = "zip"
        elif zarr_path.is_dir():
            mode = "file"
        else:
            raise ValueError(
                f"Could not identify {url},\nif #mode=nczarr,zarr, please pass this explicitly\nValid url options are described here\nhttps://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in",
            )

    # Fragment parameters are '&'-separated; keep any the caller supplied
    # rather than appending a second '#'.
    if pr.fragment:
        fragment = f"{pr.fragment}&mode=nczarr,{mode}"
    else:
        fragment = f"mode=nczarr,{mode}"

    return f"{url_base}#{fragment}"
Your mileage may vary.", + ) + return Dataset(zarr.as_zarr(ds_str)) + if netcdf.is_netcdf(ds_str): return Dataset(ds_str) diff --git a/compliance_checker/tests/data/trajectory.zarr/.zattrs b/compliance_checker/tests/data/trajectory.zarr/.zattrs new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/compliance_checker/tests/data/trajectory.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/compliance_checker/tests/data/trajectory.zarr/.zgroup b/compliance_checker/tests/data/trajectory.zarr/.zgroup new file mode 100644 index 000000000..3b7daf227 --- /dev/null +++ b/compliance_checker/tests/data/trajectory.zarr/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} \ No newline at end of file diff --git a/compliance_checker/tests/data/trajectory.zarr/.zmetadata b/compliance_checker/tests/data/trajectory.zarr/.zmetadata new file mode 100644 index 000000000..129505f92 --- /dev/null +++ b/compliance_checker/tests/data/trajectory.zarr/.zmetadata @@ -0,0 +1,169 @@ +{ + "metadata": { + ".zattrs": {}, + ".zgroup": { + "zarr_format": 2 + }, + "lat/.zarray": { + "chunks": [ + 2, + 3 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "