diff --git a/.github/workflows/default-tests.yml b/.github/workflows/default-tests.yml
index df0dc0bf..5842a940 100644
--- a/.github/workflows/default-tests.yml
+++ b/.github/workflows/default-tests.yml
@@ -26,7 +26,7 @@ jobs:
     - name: Python ${{ matrix.python-version }}
       shell: bash -l {0}
       run: |
-        conda create --name TEST python=${{ matrix.python-version }} pip "libnetcdf<4.8.0" --file requirements.txt --file test_requirements.txt --strict-channel-priority
+        conda create --name TEST python=${{ matrix.python-version }} pip --file requirements.txt --file test_requirements.txt --strict-channel-priority
         source activate TEST
         pip install -e . --no-deps --force-reinstall
 
diff --git a/compliance_checker/protocols/netcdf.py b/compliance_checker/protocols/netcdf.py
index 65c58748..035f3a0d 100644
--- a/compliance_checker/protocols/netcdf.py
+++ b/compliance_checker/protocols/netcdf.py
@@ -5,6 +5,10 @@
 
 Functions to assist in determining if the URL points to a netCDF file
 """
 
+import zipfile
+
+from pathlib import Path
+
 import requests
 
@@ -22,17 +26,25 @@ def is_netcdf(url):
     if url.endswith("nc"):
         return True
 
-    # Brute force
-    with open(url, "rb") as f:
-        magic_number = f.read(4)
-        if len(magic_number) < 4:
-            return False
-        if is_classic_netcdf(magic_number):
-            return True
-        elif is_hdf5(magic_number):
-            return True
-
+    try:
+        # Brute force
+        with open(url, "rb") as f:
+            magic_number = f.read(4)
+            if len(magic_number) < 4:
+                return False
+            if is_classic_netcdf(magic_number):
+                return True
+            elif is_hdf5(magic_number):
+                return True
+    except PermissionError:
+        # open will fail for both a directory or a local url, either of which may be pointing to a Zarr dataset
+        # directory
         return False
+    except OSError:
+        # local file url
+        return False
+
+    return False
 
 
 def is_classic_netcdf(file_buffer):
diff --git a/compliance_checker/protocols/opendap.py b/compliance_checker/protocols/opendap.py
index 3ae4e890..32066783 100644
--- a/compliance_checker/protocols/opendap.py
+++ b/compliance_checker/protocols/opendap.py
@@ -55,14 +55,19 @@ def is_opendap(url):
         das_url = url.replace("#fillmismatch", ".das")
     else:
         das_url = url + ".das"
-    response = requests.get(das_url, allow_redirects=True)
-    if "xdods-server" in response.headers:
-        return True
-    # Check if it is an access restricted ESGF thredds service
-    if (
-        response.status_code == 401
-        and "text/html" in response.headers["content-type"]
-        and "The following URL requires authentication:" in response.text
-    ):
-        return True
+
+    try:
+        response = requests.get(das_url, allow_redirects=True)
+
+        if "xdods-server" in response.headers:
+            return True
+        # Check if it is an access restricted ESGF thredds service
+        if (
+            response.status_code == 401
+            and "text/html" in response.headers["content-type"]
+            and "The following URL requires authentication:" in response.text
+        ):
+            return True
+    except requests.exceptions.InvalidSchema:
+        return False  # not opendap if url + ".das" isn't found
     return False
diff --git a/compliance_checker/protocols/zarr.py b/compliance_checker/protocols/zarr.py
new file mode 100644
index 00000000..18e7d76c
--- /dev/null
+++ b/compliance_checker/protocols/zarr.py
@@ -0,0 +1,82 @@
+import zipfile
+
+from pathlib import Path
+from urllib.parse import urlparse
+from urllib.request import url2pathname
+from zipfile import ZipFile
+
+from compliance_checker.protocols import netcdf
+
+
+#
+
+
+def is_zarr(url):
+    """ """
+
+    if netcdf.is_netcdf(url):
+        return False
+
+    if ".zarr" in url:
+        return True
+
+    if urlparse(url).scheme in ("https", "s3", "file"):
+        return True
+
+    if zipfile.is_zipfile(url):
+        if ".zmetadata" in ZipFile(url).namelist():
+            return True
+
+    if Path(url).is_dir():
+        if (Path(url) / ".zmetadata").exists():
+            return True
+
+    return False
+
+
+def as_zarr(url):
+    """
+    Transform pointers to zarr datasets to valid nczarr urls, as described in
+    https://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in\n
+    url: str or Path to valid zarr dataset\n
+    Distinct from is_cdl etc in that it will return the appropriate URI \n\n
+
+    Not tested on Windows paths at the moment, as NCZarr is not supported in Windows\n
+
+    A valid Zarr dataset could be provided in any of the following forms:\n
+    "http://s3.amazonaws.com/bucket/dataset.zarr"\n
+    "http://s3.amazonaws.com/bucket/dataset.zarr"#mode=nczarr,s3\n
+    "/home/path/to/dataset.zarr"\n
+    Path('/home/path/to/dataset.zarr')\n
+    "file:///home/path/to/dataset.zarr"\n
+    "file:///home/path/to/dataset.randomExt#mode=nczarr,file"
+    "file:///home/path/to/dataset.zarr#mode=nczarr,zip"
+    """
+
+    pr = urlparse(str(url))
+
+    if "mode=nczarr" in pr.fragment:
+        if pr.netloc:
+            return str(url)  # already valid nczarr url
+        elif pr.scheme == "file":
+            return str(url)  # already valid nczarr url
+
+    zarr_url = Path(
+        url2pathname(pr.path)
+    ).resolve()  # url2pathname necessary to avoid urlparse bug in windows
+
+    if pr.netloc:
+        mode = "s3"
+    elif zipfile.is_zipfile(zarr_url):
+        mode = "zip"
+    elif zarr_url.is_dir():
+        mode = "file"
+    else:
+        raise ValueError(
+            f"Could not identify {url},\nif #mode=nczarr,zarr, please pass this explicitly\nValid url options are described here\nhttps://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in"
+        )
+
+    url_base = url if mode == "s3" else zarr_url.as_uri()
+
+    zarr_url = f"{url_base}#mode=nczarr,{mode}"
+    return zarr_url
diff --git a/compliance_checker/suite.py b/compliance_checker/suite.py
index 26c460ae..05a5c033 100644
--- a/compliance_checker/suite.py
+++ b/compliance_checker/suite.py
@@ -6,6 +6,7 @@
 import inspect
 import itertools
 import os
+import platform
 import re
 import subprocess
 import sys
@@ -16,7 +17,9 @@
 from datetime import datetime, timezone
 from distutils.version import StrictVersion
 from operator import itemgetter
-from urllib.parse import urlparse
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+from urllib.request import url2pathname
 
 import requests
 
@@ -29,7 +32,7 @@
 from compliance_checker import MemoizedDataset, __version__, tempnc
 from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value
 from compliance_checker.cf.cf import CFBaseCheck
-from compliance_checker.protocols import cdl, erddap, netcdf, opendap
+from compliance_checker.protocols import cdl, erddap, netcdf, opendap, zarr
 
 
 # Ensure output is encoded as Unicode when checker output is redirected or piped
@@ -845,6 +848,13 @@ def load_local_dataset(self, ds_str):
         if cdl.is_cdl(ds_str):
             ds_str = self.generate_dataset(ds_str)
 
+        if zarr.is_zarr(ds_str):
+            if platform.system() != "Linux":
+                print(
+                    f"WARNING: {platform.system()} OS detected. NCZarr is not officially supported for your OS as of when this API was written. Your mileage may vary."
+                )
+            return MemoizedDataset(zarr.as_zarr(ds_str))
+
         if netcdf.is_netcdf(ds_str):
             return MemoizedDataset(ds_str)
 
diff --git a/compliance_checker/tests/conftest.py b/compliance_checker/tests/conftest.py
index d51d3898..a6f47174 100644
--- a/compliance_checker/tests/conftest.py
+++ b/compliance_checker/tests/conftest.py
@@ -24,14 +24,15 @@ def generate_dataset(cdl_path, nc_path):
     subprocess.call(["ncgen", "-o", str(nc_path), str(cdl_path)])
 
 
+datadir = Path(resource_filename("compliance_checker", "tests/data")).resolve()
+assert datadir.exists(), f"{datadir} not found"
+
+
 def static_files(cdl_stem):
     """
     Returns the Path to a valid nc dataset\n
     replaces the old STATIC_FILES dict
     """
-    datadir = Path(resource_filename("compliance_checker", "tests/data")).resolve()
-    assert datadir.exists(), f"{datadir} not found"
-
     cdl_paths = glob_down(datadir, f"{cdl_stem}.cdl", 3)
     assert (
         len(cdl_paths) > 0
diff --git a/compliance_checker/tests/data/trajectory.zarr/.zattrs b/compliance_checker/tests/data/trajectory.zarr/.zattrs
new file mode 100644
index 00000000..9e26dfee
--- /dev/null
+++ b/compliance_checker/tests/data/trajectory.zarr/.zattrs
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/compliance_checker/tests/data/trajectory.zarr/.zgroup b/compliance_checker/tests/data/trajectory.zarr/.zgroup
new file mode 100644
index 00000000..3b7daf22
--- /dev/null
+++ b/compliance_checker/tests/data/trajectory.zarr/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/compliance_checker/tests/data/trajectory.zarr/.zmetadata b/compliance_checker/tests/data/trajectory.zarr/.zmetadata
new file mode 100644
index 00000000..129505f9
--- /dev/null
+++ b/compliance_checker/tests/data/trajectory.zarr/.zmetadata
@@ -0,0 +1,169 @@
+{
+    "metadata": {
+        ".zattrs": {},
+        ".zgroup": {
+            "zarr_format": 2
+        },
+        "lat/.zarray": {
+            "chunks": [
+                2,
+                3
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "