From e290bcbd188941221e7e410c86cb10eacb3393f0 Mon Sep 17 00:00:00 2001 From: Alex Leith Date: Wed, 29 Jun 2022 11:48:48 +1000 Subject: [PATCH 1/3] Add Black linting to CI --- .github/workflows/test.yml | 11 +++++++++++ setup.py | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 43364d6..9789888 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,19 +15,30 @@ jobs: python-version: [ '3.5', '3.8' ] steps: - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install --upgrade pip pip install "numpy<1.19.0" pip install -r test_requirements.txt pip install pytest-cov + + - name: Run Black check for formatting + if: ${{ matrix.python-version != '3.5' }} + run: | + # Need to manually install Black while we support Python 3.5 + pip install black + black --check . + - name: Test with pytest run: | pytest --cov=./ --cov-report=xml + - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: diff --git a/setup.py b/setup.py index be44198..4a2ab09 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,9 @@ ] TESTS_REQUIRE = [ - 'pytest' + 'pytest', + # Need to manually install Black while we support Python 3.5 + # 'black' ] EXTRAS_REQUIRE = { From 41a85534f6a55b1f91b5dd0a2bf688c45161963d Mon Sep 17 00:00:00 2001 From: Alex Leith Date: Thu, 18 Aug 2022 17:44:30 +1000 Subject: [PATCH 2/3] Update gitignore to include hypothesis and build directories --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index eef60be..d32c0cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ # Python files *.pyc *.egg-info +build # PyCharm settings .idea + +# vscode settings +.hypothesis From 7e5fea4918981b213148be32ceaa5dbc7db2311b Mon Sep 17 00:00:00 2001 From: Alex Leith Date: Thu, 18 Aug 2022 17:45:40 +1000 Subject: [PATCH 3/3] Reformat all Python files using Black --- aodntools/__init__.py | 2 +- aodntools/ncwriter/__init__.py | 12 +- aodntools/ncwriter/imos_template.py | 45 +- aodntools/ncwriter/schema.py | 29 +- aodntools/ncwriter/template.py | 163 +++-- .../aggregated_timeseries.py | 428 ++++++++----- aodntools/timeseries_products/common.py | 109 ++-- .../timeseries_products/gridded_timeseries.py | 362 +++++++---- .../timeseries_products/hourly_timeseries.py | 581 ++++++++++++------ .../velocity_aggregated_timeseries.py | 327 ++++++---- .../velocity_hourly_timeseries.py | 393 +++++++----- examples/rottnest.py | 37 +- setup.py | 60 +- test_aodntools/base_test.py | 56 +- test_aodntools/ncwriter/test_imos_template.py | 99 +-- test_aodntools/ncwriter/test_schema.py | 115 ++-- test_aodntools/ncwriter/test_template.py | 431 +++++++------ .../test_aggregated_timeseries.py | 130 ++-- .../timeseries_products/test_common.py | 118 ++-- .../test_hourly_timeseries.py | 207 ++++--- .../test_velocity_aggregated_timeseries.py | 70 ++- .../test_velocity_hourly_timeseries.py | 71 ++- 22 files changed, 2462 insertions(+), 1383 deletions(-) diff --git a/aodntools/__init__.py b/aodntools/__init__.py index c57bfd5..6c8e6b9 100644 --- a/aodntools/__init__.py +++ b/aodntools/__init__.py @@ -1 +1 @@ -__version__ = '0.0.0' +__version__ = "0.0.0" diff --git a/aodntools/ncwriter/__init__.py b/aodntools/ncwriter/__init__.py index cb90e86..9d4d6e7 100644 --- a/aodntools/ncwriter/__init__.py +++ 
b/aodntools/ncwriter/__init__.py @@ -3,10 +3,10 @@ from .imos_template import ImosTemplate, TIMESTAMP_FORMAT __all__ = [ - 'ImosTemplate', - 'DatasetTemplate', - 'ValidationError', - 'metadata_attributes', - 'special_attributes', - 'TIMESTAMP_FORMAT' + "ImosTemplate", + "DatasetTemplate", + "ValidationError", + "metadata_attributes", + "special_attributes", + "TIMESTAMP_FORMAT", ] diff --git a/aodntools/ncwriter/imos_template.py b/aodntools/ncwriter/imos_template.py index 51a97cc..eee9619 100644 --- a/aodntools/ncwriter/imos_template.py +++ b/aodntools/ncwriter/imos_template.py @@ -5,10 +5,10 @@ from .template import DatasetTemplate -IMOS_GLOBAL_JSON = resource_filename(__name__, 'imos_global.json') +IMOS_GLOBAL_JSON = resource_filename(__name__, "imos_global.json") IMOS_GLOBAL_ATTRIBUTES = DatasetTemplate.from_json(IMOS_GLOBAL_JSON).global_attributes -TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ' +TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ" class ImosTemplate(DatasetTemplate): @@ -19,7 +19,9 @@ def __init__(self, global_attributes=None, *args, **kwargs): combined_attributes = IMOS_GLOBAL_ATTRIBUTES.copy() if global_attributes is not None: combined_attributes.update(global_attributes) - super(ImosTemplate, self).__init__(global_attributes=combined_attributes, *args, **kwargs) + super(ImosTemplate, self).__init__( + global_attributes=combined_attributes, *args, **kwargs + ) self._date_created = None @property @@ -30,9 +32,13 @@ def date_created(self): return self._date_created def add_date_created_attribute(self): - self.global_attributes['date_created'] = self.date_created.strftime(TIMESTAMP_FORMAT) + self.global_attributes["date_created"] = self.date_created.strftime( + TIMESTAMP_FORMAT + ) - def add_extent_attributes(self, time_var='TIME', vert_var='DEPTH', lat_var='LATITUDE', lon_var='LONGITUDE'): + def add_extent_attributes( + self, time_var="TIME", vert_var="DEPTH", lat_var="LATITUDE", lon_var="LONGITUDE" + ): """ Calculate spatial and temporal extents from coordinate variables in the template and add/update the relevant global attributes. Set an input variable name to None to skip that coordinate. 
@@ -46,28 +52,33 @@ def add_extent_attributes(self, time_var='TIME', vert_var='DEPTH', lat_var='LATI if time_var: data_range = self.get_data_range(time_var) time = self.variables[time_var] - units = time.get('units') + units = time.get("units") if not units: - raise ValueError("Time variable '{time_var}' has no units".format(time_var=time_var)) - calendar = time.get('calendar', 'gregorian') + raise ValueError( + "Time variable '{time_var}' has no units".format(time_var=time_var) + ) + calendar = time.get("calendar", "gregorian") time_range = num2date(data_range, units, calendar) - self.global_attributes['time_coverage_start'] = time_range[0].strftime(TIMESTAMP_FORMAT) - self.global_attributes['time_coverage_end'] = time_range[1].strftime(TIMESTAMP_FORMAT) + self.global_attributes["time_coverage_start"] = time_range[0].strftime( + TIMESTAMP_FORMAT + ) + self.global_attributes["time_coverage_end"] = time_range[1].strftime( + TIMESTAMP_FORMAT + ) if vert_var: vmin, vmax = self.get_data_range(vert_var) - self.global_attributes['geospatial_vertical_min'] = vmin - self.global_attributes['geospatial_vertical_max'] = vmax + self.global_attributes["geospatial_vertical_min"] = vmin + self.global_attributes["geospatial_vertical_max"] = vmax if lat_var: vmin, vmax = self.get_data_range(lat_var) - self.global_attributes['geospatial_lat_min'] = vmin - self.global_attributes['geospatial_lat_max'] = vmax + self.global_attributes["geospatial_lat_min"] = vmin + self.global_attributes["geospatial_lat_max"] = vmax if lon_var: vmin, vmax = self.get_data_range(lon_var) - self.global_attributes['geospatial_lon_min'] = vmin - self.global_attributes['geospatial_lon_max'] = vmax + self.global_attributes["geospatial_lon_min"] = vmin + self.global_attributes["geospatial_lon_max"] = vmax # TODO: def set_imos_filename(self): - diff --git a/aodntools/ncwriter/schema.py b/aodntools/ncwriter/schema.py index 4a2dd57..b194d9b 100644 --- a/aodntools/ncwriter/schema.py +++ b/aodntools/ncwriter/schema.py @@ -12,12 +12,13 @@ # Create a new validator class (based on Draft4Validator) to allow templates to use # * Python types or numpy dtypes to specify variable data types; and # * numpy arrays to specify variable data. 
-TemplateValidator = validators.create(meta_schema=Draft4Validator.META_SCHEMA, - validators=Draft4Validator.VALIDATORS) +TemplateValidator = validators.create( + meta_schema=Draft4Validator.META_SCHEMA, validators=Draft4Validator.VALIDATORS +) format_checker = FormatChecker() -@format_checker.checks('datatype') +@format_checker.checks("datatype") def is_python_datatype(value): """Return whether the given value is a valid data type specification for a NetCDF variable""" if isinstance(value, np.dtype): @@ -28,14 +29,16 @@ def is_python_datatype(value): return False -TYPES = {'array': (list, np.ndarray)} +TYPES = {"array": (list, np.ndarray)} -TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json') +TEMPLATE_SCHEMA_JSON = resource_filename(__name__, "template_schema.json") with open(TEMPLATE_SCHEMA_JSON) as f: TEMPLATE_SCHEMA = json.load(f) TemplateValidator.check_schema(TEMPLATE_SCHEMA) -template_validator = TemplateValidator(TEMPLATE_SCHEMA, types=TYPES, format_checker=format_checker) +template_validator = TemplateValidator( + TEMPLATE_SCHEMA, types=TYPES, format_checker=format_checker +) def validate_template(t): @@ -43,16 +46,20 @@ def validate_template(t): def validate_dimensions(d): - validate_template({'_dimensions': d}) + validate_template({"_dimensions": d}) def validate_variables(v): - validate_template({'_variables': v}) + validate_template({"_variables": v}) def validate_global_attributes(a): - if hasattr(a, 'keys'): - special = [k for k in a.keys() if k.startswith('_')] + if hasattr(a, "keys"): + special = [k for k in a.keys() if k.startswith("_")] if special: - raise ValidationError('Special attributes {} not allowed in global attributes dict'.format(special)) + raise ValidationError( + "Special attributes {} not allowed in global attributes dict".format( + special + ) + ) template_validator.validate(a) diff --git a/aodntools/ncwriter/template.py b/aodntools/ncwriter/template.py index 62c6c45..1d86df0 100644 --- a/aodntools/ncwriter/template.py +++ b/aodntools/ncwriter/template.py @@ -17,7 +17,12 @@ import netCDF4 import numpy as np -from .schema import validate_dimensions, validate_variables, validate_global_attributes, ValidationError +from .schema import ( + validate_dimensions, + validate_variables, + validate_global_attributes, + ValidationError, +) def metadata_attributes(attr): @@ -49,41 +54,39 @@ def special_attributes(attr): """ meta = attr.__class__() for k, v in attr.items(): - if k[0].startswith('_'): + if k[0].startswith("_"): meta[k[1:]] = v return meta class NetCDFGroupDict(object): - def __init__(self, - dimensions=None, - variables=None, - global_attributes=None, - **kwargs): - """ A dictionary to hold netCDF groups - It consist of a generic class holding 3 different dictionaries: - dimensions is a dict - variables is dict - global_attributes is a dict - - This class has __add__ to combine variables/dimensions/global attributes - from :NetCDFGroupDict: instances. 
- - Example: - dmn = {'lon':360,'lat':210} - var = {} - var['water'] = {'_datatype':'double','_dimensions':['lat','lon']} - w1 = NetCDFGroupDict(dimensions=dmn,variables=var) - - dmn2 = {'time':300,'lon':720,'lat':330} - var2 = {} - var2['temp'] = {'_datatype':'double','_dimensions':['time','lat','lon']} - w2 = NetCDFGroupDict(dimensions=dmn2,variables=var2) - - w3 = w1+w2 - #w3.variables.keys() = ['water','temp'] - #w3.dimensions = {'time':300,'lon':360,'lat':210} + def __init__( + self, dimensions=None, variables=None, global_attributes=None, **kwargs + ): + """A dictionary to hold netCDF groups + It consist of a generic class holding 3 different dictionaries: + dimensions is a dict + variables is dict + global_attributes is a dict + + This class has __add__ to combine variables/dimensions/global attributes + from :NetCDFGroupDict: instances. + + Example: + dmn = {'lon':360,'lat':210} + var = {} + var['water'] = {'_datatype':'double','_dimensions':['lat','lon']} + w1 = NetCDFGroupDict(dimensions=dmn,variables=var) + + dmn2 = {'time':300,'lon':720,'lat':330} + var2 = {} + var2['temp'] = {'_datatype':'double','_dimensions':['time','lat','lon']} + w2 = NetCDFGroupDict(dimensions=dmn2,variables=var2) + + w3 = w1+w2 + #w3.variables.keys() = ['water','temp'] + #w3.dimensions = {'time':300,'lon':360,'lat':210} """ self._dimensions = None self._variables = None @@ -148,9 +151,19 @@ def validate_template_schema(self): class DatasetTemplate(NetCDFGroupDict): """Template object used for creating netCDF files""" - STRUCTURAL_ATTRIBUTES = {'datatype', 'dimensions', 'zlib', 'complevel', 'shuffle', 'fletcher32', 'contiguous', - 'chunksizes', 'endian', 'least_significant_digit'} - FILL_VALUE_ALIASES = {'fill_value', 'FillValue'} + STRUCTURAL_ATTRIBUTES = { + "datatype", + "dimensions", + "zlib", + "complevel", + "shuffle", + "fletcher32", + "contiguous", + "chunksizes", + "endian", + "least_significant_digit", + } + FILL_VALUE_ALIASES = {"fill_value", "FillValue"} def __init__(self, *args, **kwargs): super(DatasetTemplate, self).__init__(*args, **kwargs) @@ -165,12 +178,15 @@ def from_json(cls, path): try: template = json.load(f, object_pairs_hook=OrderedDict) except ValueError as e: - raise ValueError("invalid JSON file '{path}' ({e})".format(path=path, e=e)) + raise ValueError( + "invalid JSON file '{path}' ({e})".format(path=path, e=e) + ) - return cls(dimensions=template.get('_dimensions'), - variables=template.get('_variables'), - global_attributes=metadata_attributes(template) - ) + return cls( + dimensions=template.get("_dimensions"), + variables=template.get("_variables"), + global_attributes=metadata_attributes(template), + ) def ensure_completeness(self): """Ensure that all variables have all the necessary information to create a netCDF file. 
@@ -186,17 +202,27 @@ def ensure_completeness(self): var["_dimensions"] = [] if "_data" not in var: - raise ValidationError("No data specified for variable '{name}'".format(name=name)) + raise ValidationError( + "No data specified for variable '{name}'".format(name=name) + ) if var["_data"] is not None and not isinstance(var["_data"], np.ndarray): var["_data"] = np.array(var["_data"]) if "_datatype" not in var: - datatype = getattr(var["_data"], 'dtype', None) + datatype = getattr(var["_data"], "dtype", None) if datatype is not None: - warn("Guessed data type '{datatype}' for variable '{name}'".format(datatype=datatype, name=name)) + warn( + "Guessed data type '{datatype}' for variable '{name}'".format( + datatype=datatype, name=name + ) + ) var["_datatype"] = datatype else: - raise ValidationError("No data type information for variable '{name}'".format(name=name)) + raise ValidationError( + "No data type information for variable '{name}'".format( + name=name + ) + ) def ensure_consistency(self): """For each variable, ensure that the specified dimensions and data arrays are consistent with each other, and @@ -217,15 +243,18 @@ def ensure_consistency(self): # TODO: check for "unused" dimensions? for name, var in self.variables.items(): # check dimensions exist - var_dims = var['_dimensions'] or [] + var_dims = var["_dimensions"] or [] inconsistent_dims = set(var_dims).difference(self.dimensions) if inconsistent_dims: - raise ValidationError("Variable '{name}' has undefined dimensions " - "{inconsistent_dims}".format(name=name, inconsistent_dims=inconsistent_dims) - ) + raise ValidationError( + "Variable '{name}' has undefined dimensions " + "{inconsistent_dims}".format( + name=name, inconsistent_dims=inconsistent_dims + ) + ) # if we have no data array, can't do any more - values = var.get('_data') + values = var.get("_data") if values is None: continue @@ -234,7 +263,8 @@ def ensure_consistency(self): if len(var_shape) != len(var_dims): raise ValueError( "Variable '{name}' has {ndim} dimensions, but value array has {nshape} dimensions.".format( - name=name, ndim=len(var_dims), nshape=len(var_shape)) + name=name, ndim=len(var_dims), nshape=len(var_shape) + ) ) # adjust dimension size if not already set @@ -248,7 +278,10 @@ def ensure_consistency(self): raise ValueError( "Variable '{name}' has dimensions {var_dims} and shape {var_shape}, inconsistent with dimension " "sizes defined in template {template_shape}".format( - name=name, var_dims=var_dims, var_shape=var_shape, template_shape=template_shape + name=name, + var_dims=var_dims, + var_shape=var_shape, + template_shape=template_shape, ) ) @@ -282,7 +315,7 @@ def _create_var_opts(self, vname, vdict): var_opts = {k: special_dict[k] for k in struct_keys} if fill_aliases: - var_opts['fill_value'] = special_dict[fill_aliases.pop()] + var_opts["fill_value"] = special_dict[fill_aliases.pop()] return var_opts def create_dimensions(self): @@ -298,11 +331,11 @@ def create_variables(self, **kwargs): # variable attributes to convert to the same type as the variable # datatype - varattrs_to_convert_to_datatype = ['valid_min', 'valid_max', 'valid_range'] + varattrs_to_convert_to_datatype = ["valid_min", "valid_max", "valid_range"] for varname, varattr in self.variables.items(): - if not varattr['_dimensions']: # no kwargs in createVariable - ncvar = self.ncobj.createVariable(varname, varattr['_datatype']) + if not varattr["_dimensions"]: # no kwargs in createVariable + ncvar = self.ncobj.createVariable(varname, varattr["_datatype"]) else: var_c_opts = 
self._create_var_opts(varname, varattr) var_c_opts.update(kwargs) @@ -310,13 +343,15 @@ def create_variables(self, **kwargs): ncvar = self.ncobj.createVariable(varname, **var_c_opts) # add variable values - if varattr['_data'] is not None: - ncvar[:] = varattr['_data'] + if varattr["_data"] is not None: + ncvar[:] = varattr["_data"] # convert some variables attribute to variable datatype for varattr_to_convert in varattrs_to_convert_to_datatype: if varattr_to_convert in varattr.keys(): - varattr[varattr_to_convert] = np.array(varattr[varattr_to_convert], dtype=varattr['_datatype']) + varattr[varattr_to_convert] = np.array( + varattr[varattr_to_convert], dtype=varattr["_datatype"] + ) # add variable attributes ncvar.setncatts(metadata_attributes(varattr)) @@ -344,7 +379,7 @@ def to_netcdf(self, outfile, var_args=None, **kwargs): self.ensure_consistency() try: - self.ncobj = netCDF4.Dataset(self.outfile, mode='w', **kwargs) + self.ncobj = netCDF4.Dataset(self.outfile, mode="w", **kwargs) self.create_dimensions() self.create_variables(**_var_args) self.create_global_attributes() @@ -354,7 +389,7 @@ def to_netcdf(self, outfile, var_args=None, **kwargs): finally: self.ncobj.close() - self.ncobj = netCDF4.Dataset(self.outfile, 'a') + self.ncobj = netCDF4.Dataset(self.outfile, "a") def get_data_range(self, varname): """ @@ -367,17 +402,21 @@ def get_data_range(self, varname): """ var = self.variables.get(varname) if var is None: - raise ValueError("Variable '{varname}' does not exist".format(varname=varname)) - data = var.get('_data', []) + raise ValueError( + "Variable '{varname}' does not exist".format(varname=varname) + ) + data = var.get("_data", []) # mask out the fillvalues - fill_value = var.get('_FillValue') or var.get('_fill_value') + fill_value = var.get("_FillValue") or var.get("_fill_value") if fill_value is None: mask = np.isnan(data) else: mask = np.logical_or(data == fill_value, np.isnan(data)) data_masked = np.ma.array(data, mask=mask) if data_masked.mask.all(): - raise ValueError("No valid data for variable '{varname}'".format(varname=varname)) + raise ValueError( + "No valid data for variable '{varname}'".format(varname=varname) + ) return data_masked.min(), data_masked.max() diff --git a/aodntools/timeseries_products/aggregated_timeseries.py b/aodntools/timeseries_products/aggregated_timeseries.py index cda7380..372ad09 100644 --- a/aodntools/timeseries_products/aggregated_timeseries.py +++ b/aodntools/timeseries_products/aggregated_timeseries.py @@ -13,13 +13,19 @@ from pkg_resources import resource_filename from aodntools import __version__ -from aodntools.timeseries_products.common import (NoInputFilesError, check_file, in_water, current_utc_timestamp, - TIMESTAMP_FORMAT, DATESTAMP_FORMAT) +from aodntools.timeseries_products.common import ( + NoInputFilesError, + check_file, + in_water, + current_utc_timestamp, + TIMESTAMP_FORMAT, + DATESTAMP_FORMAT, +) -TEMPLATE_JSON = resource_filename(__name__, 'aggregated_timeseries_template.json') +TEMPLATE_JSON = resource_filename(__name__, "aggregated_timeseries_template.json") -def sort_files(files_to_agg, input_dir=''): +def sort_files(files_to_agg, input_dir=""): """ sort list of files according to deployment date :param files_to_agg: List of files to sort @@ -30,7 +36,7 @@ def sort_files(files_to_agg, input_dir=''): time_start = [] for file in files_to_agg: with Dataset(os.path.join(input_dir, file)) as ds: - time_start.append(np.datetime64(ds.time_deployment_start.rstrip('Z'))) + 
time_start.append(np.datetime64(ds.time_deployment_start.rstrip("Z"))) tuples = sorted(zip(time_start, files_to_agg)) return [t[1] for t in tuples] @@ -50,9 +56,11 @@ def get_variable_values(nc, variable): if variable in file_variables: variable_values = nc[variable].values if any(np.isnan(variable_values)): - variable_values = np.ma.masked_array(variable_values, mask=np.isnan(variable_values)) - if variable+'_quality_control' in file_variables: - variableQC_values = nc[variable+'_quality_control'].values + variable_values = np.ma.masked_array( + variable_values, mask=np.isnan(variable_values) + ) + if variable + "_quality_control" in file_variables: + variableQC_values = nc[variable + "_quality_control"].values else: variableQC_values = 0 else: @@ -68,10 +76,12 @@ def get_instrument_id(nc): :param nc: xarray dataset :return: instrumentID as string """ - deployment_code = nc.attrs.get('deployment_code', '[unknown deployment]') - instrument = nc.attrs.get('instrument', '[unknown instrument]') - instrument_serial_number = nc.attrs.get('instrument_serial_number', '[unknown serial number]') - return '; '.join([deployment_code, instrument, instrument_serial_number]) + deployment_code = nc.attrs.get("deployment_code", "[unknown deployment]") + instrument = nc.attrs.get("instrument", "[unknown instrument]") + instrument_serial_number = nc.attrs.get( + "instrument_serial_number", "[unknown serial number]" + ) + return "; ".join([deployment_code, instrument, instrument_serial_number]) def get_nominal_depth(nc): @@ -83,7 +93,7 @@ def get_nominal_depth(nc): :return: nominal depth of the instrument """ - if 'NOMINAL_DEPTH' in list(nc.variables): + if "NOMINAL_DEPTH" in list(nc.variables): nominal_depth = nc.NOMINAL_DEPTH.squeeze().values else: nominal_depth = nc.instrument_nominal_depth @@ -91,7 +101,7 @@ def get_nominal_depth(nc): return nominal_depth -def get_contributors(files_to_agg, input_dir=''): +def get_contributors(files_to_agg, input_dir=""): """ get the author and principal investigator details for each file @@ -106,19 +116,29 @@ def get_contributors(files_to_agg, input_dir=''): for file in files_to_agg: with xr.open_dataset(os.path.join(input_dir, file)) as nc: attributes = nc.attrs.keys() - if all(att in attributes for att in ['author', 'author_email']): - contributors.add((nc.author, nc.author_email, 'author')) - if all(att in attributes for att in ['principal_investigator', 'principal_investigator_email']): - contributors.add((nc.principal_investigator, nc.principal_investigator_email, 'principal_investigator')) + if all(att in attributes for att in ["author", "author_email"]): + contributors.add((nc.author, nc.author_email, "author")) + if all( + att in attributes + for att in ["principal_investigator", "principal_investigator_email"] + ): + contributors.add( + ( + nc.principal_investigator, + nc.principal_investigator_email, + "principal_investigator", + ) + ) for item in contributors: contributor_name.append(item[0]) contributor_email.append(item[1]) contributor_role.append(item[2]) - return {'contributor_name': "; ".join(contributor_name), - 'contributor_email': "; ".join(contributor_email), - 'contributor_role': "; ".join(contributor_role) - } + return { + "contributor_name": "; ".join(contributor_name), + "contributor_email": "; ".join(contributor_email), + "contributor_role": "; ".join(contributor_role), + } def get_data_code(VoI): @@ -129,27 +149,29 @@ def get_data_code(VoI): :return: variable data code """ - #dictionary of data code. 
could be read from external file - dataCodes = {'DEPTH': 'Z', - 'PRES': 'Z', - 'PRES_REL': 'Z', - 'TEMP': 'T', - 'PSAL': 'S', - 'PAR': 'F', - 'TURB': 'U', - 'TURBF': 'U', - 'DOX1': 'O', - 'DOX1_2': 'O', - 'DOX1_3': 'O', - 'DOX2': 'O', - 'DOX2_1': 'O', - 'DOXS': 'O', - 'CPHL': 'B', - 'CHLU': 'B', - 'CHLF': 'B', - 'UCUR': 'V', - 'VCUR': 'V', - 'WCUR': 'V'} + # dictionary of data code. could be read from external file + dataCodes = { + "DEPTH": "Z", + "PRES": "Z", + "PRES_REL": "Z", + "TEMP": "T", + "PSAL": "S", + "PAR": "F", + "TURB": "U", + "TURBF": "U", + "DOX1": "O", + "DOX1_2": "O", + "DOX1_3": "O", + "DOX2": "O", + "DOX2_1": "O", + "DOXS": "O", + "CPHL": "B", + "CHLU": "B", + "CHLF": "B", + "UCUR": "V", + "VCUR": "V", + "WCUR": "V", + } return dataCodes[VoI] @@ -171,22 +193,34 @@ def source_file_attributes(download_url_prefix, opendap_url_prefix): :param opendap_url_prefix: prefix string for OPENDAP URLs :return: dictionary of attributes to add to the source_file variable """ - attributes = {'comment': "This variable lists the relative path of each input file."} + attributes = { + "comment": "This variable lists the relative path of each input file." + } if download_url_prefix: - attributes['comment'] += (" To obtain a download URL for a file, " - "append its path to the download_url_prefix attribute.") - attributes['download_url_prefix'] = download_url_prefix + attributes["comment"] += ( + " To obtain a download URL for a file, " + "append its path to the download_url_prefix attribute." + ) + attributes["download_url_prefix"] = download_url_prefix if opendap_url_prefix: - attributes['comment'] += (" To interact with the file remotely via the OPENDAP protocol, " - "append its path to the opendap_url_prefix attribute.") - attributes['opendap_url_prefix'] = opendap_url_prefix + attributes["comment"] += ( + " To interact with the file remotely via the OPENDAP protocol, " + "append its path to the opendap_url_prefix attribute." + ) + attributes["opendap_url_prefix"] = opendap_url_prefix return attributes - ## MAIN FUNCTION -def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_dir='./', - download_url_prefix=None, opendap_url_prefix=None): +def main_aggregator( + files_to_agg, + var_to_agg, + site_code, + input_dir="", + output_dir="./", + download_url_prefix=None, + opendap_url_prefix=None, +): """ Aggregate the Variable of Interest (VoI) from all deployments at one site. additional metadata variables are stored to track the origin of the data @@ -201,16 +235,16 @@ def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_di :return: name of the resulting file, list of rejected files """ - time_units="days since 1950-01-01 00:00:00 UTC" - time_calendar="gregorian" + time_units = "days since 1950-01-01 00:00:00 UTC" + time_calendar = "gregorian" epoch = np.datetime64("1950-01-01T00:00:00") - one_day = np.timedelta64(1, 'D') + one_day = np.timedelta64(1, "D") bad_files = {} rejected_files = [] # default name for temporary file. 
It will be renamed at the end - _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir) + _, temp_outfile = tempfile.mkstemp(suffix=".nc", dir=output_dir) ## check files and get total number of flattened obs n_obs_total = 0 @@ -236,37 +270,67 @@ def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_di n_files = len(files_to_agg) ## create ncdf file, dimensions and variables - ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC') - OBSERVATION = ds.createDimension('OBSERVATION', size=n_obs_total) - INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files) + ds = Dataset(os.path.join(output_dir, temp_outfile), "w", format="NETCDF4_CLASSIC") + OBSERVATION = ds.createDimension("OBSERVATION", size=n_obs_total) + INSTRUMENT = ds.createDimension("INSTRUMENT", size=n_files) STRING256 = ds.createDimension("strlen", 256) - - obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION',), "fill_value": 99999.0} - obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION',), "fill_value": 99999.0} - obs_byte_template = {'datatype': np.byte, 'zlib': True, 'dimensions': ('OBSERVATION',), 'fill_value': 99} - obs_int_template = {'datatype': np.int16, 'zlib': True, 'dimensions': ('OBSERVATION',)} - inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")} - inst_float_template = {'datatype': np.float32, 'dimensions': ('INSTRUMENT',), "fill_value": 99999.0} - inst_double_template = {'datatype': np.float64, 'dimensions': ('INSTRUMENT',), "fill_value": 99999.0} + obs_float_template = { + "datatype": np.float32, + "zlib": True, + "dimensions": ("OBSERVATION",), + "fill_value": 99999.0, + } + obs_double_template = { + "datatype": np.float64, + "zlib": True, + "dimensions": ("OBSERVATION",), + "fill_value": 99999.0, + } + obs_byte_template = { + "datatype": np.byte, + "zlib": True, + "dimensions": ("OBSERVATION",), + "fill_value": 99, + } + obs_int_template = { + "datatype": np.int16, + "zlib": True, + "dimensions": ("OBSERVATION",), + } + inst_S256_template = {"datatype": "S1", "dimensions": ("INSTRUMENT", "strlen")} + inst_float_template = { + "datatype": np.float32, + "dimensions": ("INSTRUMENT",), + "fill_value": 99999.0, + } + inst_double_template = { + "datatype": np.float64, + "dimensions": ("INSTRUMENT",), + "fill_value": 99999.0, + } agg_variable = ds.createVariable(varname=var_to_agg, **obs_float_template) - agg_variable_qc = ds.createVariable(varname=var_to_agg + '_quality_control', **obs_byte_template) - DEPTH = ds.createVariable(varname='DEPTH', **obs_float_template) - DEPTHqc = ds.createVariable(varname='DEPTH_quality_control', **obs_byte_template) - PRES = ds.createVariable(varname='PRES', **obs_float_template) - PRESqc = ds.createVariable(varname='PRES_quality_control', **obs_byte_template) - PRES_REL = ds.createVariable(varname='PRES_REL', **obs_float_template) - PRES_RELqc = ds.createVariable(varname='PRES_REL_quality_control', **obs_byte_template) - - TIME = ds.createVariable(varname='TIME', **obs_double_template) - instrument_index = ds.createVariable(varname='instrument_index', **obs_int_template) - - source_file = ds.createVariable(varname='source_file', **inst_S256_template) - instrument_id = ds.createVariable(varname='instrument_id', **inst_S256_template) - LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template) - LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template) - NOMINAL_DEPTH = 
ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template) + agg_variable_qc = ds.createVariable( + varname=var_to_agg + "_quality_control", **obs_byte_template + ) + DEPTH = ds.createVariable(varname="DEPTH", **obs_float_template) + DEPTHqc = ds.createVariable(varname="DEPTH_quality_control", **obs_byte_template) + PRES = ds.createVariable(varname="PRES", **obs_float_template) + PRESqc = ds.createVariable(varname="PRES_quality_control", **obs_byte_template) + PRES_REL = ds.createVariable(varname="PRES_REL", **obs_float_template) + PRES_RELqc = ds.createVariable( + varname="PRES_REL_quality_control", **obs_byte_template + ) + + TIME = ds.createVariable(varname="TIME", **obs_double_template) + instrument_index = ds.createVariable(varname="instrument_index", **obs_int_template) + + source_file = ds.createVariable(varname="source_file", **inst_S256_template) + instrument_id = ds.createVariable(varname="instrument_id", **inst_S256_template) + LATITUDE = ds.createVariable(varname="LATITUDE", **inst_double_template) + LONGITUDE = ds.createVariable(varname="LONGITUDE", **inst_double_template) + NOMINAL_DEPTH = ds.createVariable(varname="NOMINAL_DEPTH", **inst_float_template) ## main loop start = 0 @@ -275,10 +339,14 @@ def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_di nc = in_water(nc) n_obs = len(nc.TIME) end = start + n_obs - agg_variable[start:end], agg_variable_qc[start:end] = get_variable_values(nc, var_to_agg) - DEPTH[start:end], DEPTHqc[start:end] = get_variable_values(nc, 'DEPTH') - PRES[start:end], PRESqc[start:end] = get_variable_values(nc, 'PRES') - PRES_REL[start:end], PRES_RELqc[start:end] = get_variable_values(nc, 'PRES_REL') + agg_variable[start:end], agg_variable_qc[start:end] = get_variable_values( + nc, var_to_agg + ) + DEPTH[start:end], DEPTHqc[start:end] = get_variable_values(nc, "DEPTH") + PRES[start:end], PRESqc[start:end] = get_variable_values(nc, "PRES") + PRES_REL[start:end], PRES_RELqc[start:end] = get_variable_values( + nc, "PRES_REL" + ) ## set TIME and instrument index TIME[start:end] = (nc.TIME.values - epoch) / one_day @@ -287,54 +355,79 @@ def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_di LATITUDE[index] = nc.LATITUDE.values LONGITUDE[index] = nc.LONGITUDE.values NOMINAL_DEPTH[index] = get_nominal_depth(nc) - source_file[index] = stringtochar(np.array(file, dtype='S256')) - instrument_id[index] = stringtochar(np.array(get_instrument_id(nc), dtype='S256')) + source_file[index] = stringtochar(np.array(file, dtype="S256")) + instrument_id[index] = stringtochar( + np.array(get_instrument_id(nc), dtype="S256") + ) start = end - ## add atributes with open(TEMPLATE_JSON) as json_file: attribute_dictionary = json.load(json_file) - variable_attribute_dictionary = attribute_dictionary['_variables'] - global_attribute_dictionary = attribute_dictionary['_global'] + variable_attribute_dictionary = attribute_dictionary["_variables"] + global_attribute_dictionary = attribute_dictionary["_global"] ## set variable attrs for var in list(ds.variables): ds[var].setncatts(variable_attribute_dictionary[var]) if download_url_prefix or opendap_url_prefix: - ds['source_file'].setncatts(source_file_attributes(download_url_prefix, opendap_url_prefix)) + ds["source_file"].setncatts( + source_file_attributes(download_url_prefix, opendap_url_prefix) + ) ## set global attrs - time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(TIMESTAMP_FORMAT) - time_end = num2date(np.max(TIME[:]), time_units, 
time_calendar).strftime(TIMESTAMP_FORMAT) - time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(DATESTAMP_FORMAT) - time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(DATESTAMP_FORMAT) + time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime( + TIMESTAMP_FORMAT + ) + time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime( + TIMESTAMP_FORMAT + ) + time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime( + DATESTAMP_FORMAT + ) + time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime( + DATESTAMP_FORMAT + ) add_attribute = { - 'title': ("Long Timeseries Velocity Aggregated product: " + var_to_agg + " at " + - site_code + " between " + time_start + " and " + time_end), - 'site_code': site_code, - 'time_coverage_start': time_start, - 'time_coverage_end': time_end, - 'geospatial_vertical_min': np.min(ds['DEPTH'][:]), - 'geospatial_vertical_max': np.max(ds['DEPTH'][:]), - 'geospatial_lat_min': np.min(ds['LATITUDE'][:]), - 'geospatial_lat_max': np.max(ds['LATITUDE'][:]), - 'geospatial_lon_min': np.min(ds['LONGITUDE'][:]), - 'geospatial_lon_max': np.max(ds['LONGITUDE'][:]), - 'date_created': current_utc_timestamp(), - 'history': current_utc_timestamp() + ': Aggregated file created.', - 'keywords': ', '.join([var_to_agg, 'AGGREGATED']), - 'rejected_files': "\n".join(rejected_files), - 'generating_code_version': __version__} - add_attribute.update(get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)) - - github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/' - '{v}/aodntools/timeseries_products/aggregated_timeseries.py'.format(v=__version__) - ) - global_attribute_dictionary['lineage'] += github_comment + "title": ( + "Long Timeseries Velocity Aggregated product: " + + var_to_agg + + " at " + + site_code + + " between " + + time_start + + " and " + + time_end + ), + "site_code": site_code, + "time_coverage_start": time_start, + "time_coverage_end": time_end, + "geospatial_vertical_min": np.min(ds["DEPTH"][:]), + "geospatial_vertical_max": np.max(ds["DEPTH"][:]), + "geospatial_lat_min": np.min(ds["LATITUDE"][:]), + "geospatial_lat_max": np.max(ds["LATITUDE"][:]), + "geospatial_lon_min": np.min(ds["LONGITUDE"][:]), + "geospatial_lon_max": np.max(ds["LONGITUDE"][:]), + "date_created": current_utc_timestamp(), + "history": current_utc_timestamp() + ": Aggregated file created.", + "keywords": ", ".join([var_to_agg, "AGGREGATED"]), + "rejected_files": "\n".join(rejected_files), + "generating_code_version": __version__, + } + add_attribute.update( + get_contributors(files_to_agg=files_to_agg, input_dir=input_dir) + ) + + github_comment = ( + "\nThis file was created using https://github.com/aodn/python-aodntools/blob/" + "{v}/aodntools/timeseries_products/aggregated_timeseries.py".format( + v=__version__ + ) + ) + global_attribute_dictionary["lineage"] += github_comment global_attribute_dictionary.update(add_attribute) ds.setncatts(dict(sorted(global_attribute_dictionary.items()))) @@ -342,12 +435,25 @@ def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_di ## create the output file name and rename the tmp file facility_code = get_facility_code(os.path.join(input_dir, files_to_agg[0])) - data_code = get_data_code(var_to_agg) + 'Z' - product_type = 'aggregated-timeseries' + data_code = get_data_code(var_to_agg) + "Z" + product_type = "aggregated-timeseries" file_version = 1 - 
output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code, ('FV0'+str(file_version)), - (var_to_agg + "-" + product_type), - ('END-'+ time_end_filename), 'C-' + current_utc_timestamp(DATESTAMP_FORMAT)]) + '.nc' + output_name = ( + "_".join( + [ + "IMOS", + facility_code, + data_code, + time_start_filename, + site_code, + ("FV0" + str(file_version)), + (var_to_agg + "-" + product_type), + ("END-" + time_end_filename), + "C-" + current_utc_timestamp(DATESTAMP_FORMAT), + ] + ) + + ".nc" + ) ncout_path = os.path.join(output_dir, output_name) shutil.move(temp_outfile, os.path.join(output_dir, ncout_path)) @@ -356,24 +462,62 @@ def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_di if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Aggregate ONE variable from ALL instruments from ALL deployments from ONE site") - parser.add_argument('-site', dest='site_code', help='site code, like NRMMAI', required=True) - parser.add_argument('-var', dest='varname', help='variable to aggregate, like TEMP', required=True) - parser.add_argument('-files', dest='filenames', - help='name of the file that contains the source URLs (relative to inpath, if given)', - required=True) - parser.add_argument('-indir', dest='input_dir', help='base path of input files', default='', required=False) - parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. Default ./', - default='./', required=False) - parser.add_argument('-download_url', dest='download_url', help='path to the download_url_prefix', - default='', required=False) - parser.add_argument('-opendap_url', dest='opendap_url', help='path to the opendap_url_prefix', - default='', required=False) + parser = argparse.ArgumentParser( + description="Aggregate ONE variable from ALL instruments from ALL deployments from ONE site" + ) + parser.add_argument( + "-site", dest="site_code", help="site code, like NRMMAI", required=True + ) + parser.add_argument( + "-var", dest="varname", help="variable to aggregate, like TEMP", required=True + ) + parser.add_argument( + "-files", + dest="filenames", + help="name of the file that contains the source URLs (relative to inpath, if given)", + required=True, + ) + parser.add_argument( + "-indir", + dest="input_dir", + help="base path of input files", + default="", + required=False, + ) + parser.add_argument( + "-outdir", + dest="output_dir", + help="path where the result file will be written. 
Default ./", + default="./", + required=False, + ) + parser.add_argument( + "-download_url", + dest="download_url", + help="path to the download_url_prefix", + default="", + required=False, + ) + parser.add_argument( + "-opendap_url", + dest="opendap_url", + help="path to the opendap_url_prefix", + default="", + required=False, + ) args = parser.parse_args() - with open(os.path.join(args.input_dir,args.filenames)) as ff: + with open(os.path.join(args.input_dir, args.filenames)) as ff: files_to_agg = [line.rstrip() for line in ff] - print(main_aggregator(files_to_agg=files_to_agg, var_to_agg=args.varname, site_code=args.site_code, - input_dir=args.input_dir, output_dir=args.output_dir, - download_url_prefix=args.download_url, opendap_url_prefix=args.opendap_url)) + print( + main_aggregator( + files_to_agg=files_to_agg, + var_to_agg=args.varname, + site_code=args.site_code, + input_dir=args.input_dir, + output_dir=args.output_dir, + download_url_prefix=args.download_url, + opendap_url_prefix=args.opendap_url, + ) + ) diff --git a/aodntools/timeseries_products/common.py b/aodntools/timeseries_products/common.py index 75c75f2..5e63f14 100644 --- a/aodntools/timeseries_products/common.py +++ b/aodntools/timeseries_products/common.py @@ -4,12 +4,13 @@ import numpy as np # Common date/time format strings -TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ' -DATESTAMP_FORMAT = '%Y%m%d' +TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ" +DATESTAMP_FORMAT = "%Y%m%d" class NoInputFilesError(Exception): """Exception raised if there are no valid input files to aggregate""" + pass @@ -21,7 +22,8 @@ def get_qc_variable_names(nc): :return: list of variable names """ varlist = list(nc.variables) - return [v for v in varlist if v.endswith('_quality_control')] + return [v for v in varlist if v.endswith("_quality_control")] + def check_imos_flag_conventions(nc, varnames=None): """ @@ -41,30 +43,42 @@ def check_imos_flag_conventions(nc, varnames=None): # accept two variants on the convention name, used in versions 1.3 and 1.4 of the # IMOS NetCDF Conventions document - accepted_conventions = {"IMOS standard flags", "IMOS standard set using the IODE flags"} + accepted_conventions = { + "IMOS standard flags", + "IMOS standard set using the IODE flags", + } errors = set() for var in varnames: if var not in nc.variables: - errors.add('variable {var} not in file'.format(var=var)) + errors.add("variable {var} not in file".format(var=var)) continue - conventions = getattr(nc[var], 'quality_control_conventions', None) + conventions = getattr(nc[var], "quality_control_conventions", None) if conventions is None: - errors.add('variable {var} missing quality_control_conventions'.format(var=var)) + errors.add( + "variable {var} missing quality_control_conventions".format(var=var) + ) continue if conventions not in accepted_conventions: - errors.add('unexpected quality_control_conventions: "{conventions}"'.format(conventions=conventions)) + errors.add( + 'unexpected quality_control_conventions: "{conventions}"'.format( + conventions=conventions + ) + ) return sorted(errors) -def check_file(nc, site_code, variables_of_interest, - required_variables=('TIME', 'LATITUDE', 'LONGITUDE'), - allowed_dimensions=('TIME', 'LATITUDE', 'LONGITUDE') - ): +def check_file( + nc, + site_code, + variables_of_interest, + required_variables=("TIME", "LATITUDE", "LONGITUDE"), + allowed_dimensions=("TIME", "LATITUDE", "LONGITUDE"), +): """ Check that a file meets the requirements for inclusion in a product. 
Return a list of errors - + Checks applied: * Correct site_code * file_version is FV01 @@ -92,46 +106,51 @@ def check_file(nc, site_code, variables_of_interest, variables = set(nc.variables) error_list = [] - if site_code != nc.attrs.get('site_code', '[missing]'): - error_list.append('Wrong site_code: ' + nc.site_code) + if site_code != nc.attrs.get("site_code", "[missing]"): + error_list.append("Wrong site_code: " + nc.site_code) - nc_file_version = nc.attrs.get('file_version', '[missing]') - if 'Level 1' not in nc_file_version: - error_list.append('Wrong file version: ' + nc_file_version) + nc_file_version = nc.attrs.get("file_version", "[missing]") + if "Level 1" not in nc_file_version: + error_list.append("Wrong file version: " + nc_file_version) for var in set(required_variables) - variables: - error_list.append('{var} variable missing'.format(var=var)) + error_list.append("{var} variable missing".format(var=var)) variables_to_aggregate = set(variables_of_interest) & variables if not variables_to_aggregate: - error_list.append('no variables to aggregate') + error_list.append("no variables to aggregate") for var in variables_to_aggregate: dims = set(nc[var].dims) - if 'TIME' not in dims: - error_list.append('no TIME dimension for {}'.format(var)) - if 'LATITUDE' in dims and len(nc.LATITUDE) > 1: - error_list.append('more than one LATITUDE') - if 'LONGITUDE' in dims and len(nc.LONGITUDE) > 1: - error_list.append('more than one LONGITUDE') + if "TIME" not in dims: + error_list.append("no TIME dimension for {}".format(var)) + if "LATITUDE" in dims and len(nc.LATITUDE) > 1: + error_list.append("more than one LATITUDE") + if "LONGITUDE" in dims and len(nc.LONGITUDE) > 1: + error_list.append("more than one LONGITUDE") other_dims = dims - set(allowed_dimensions) if other_dims: error_list.append( - 'dimension(s) {other_dims} not allowed for {var}'.format(other_dims=other_dims, var=var) + "dimension(s) {other_dims} not allowed for {var}".format( + other_dims=other_dims, var=var + ) ) - if 'NOMINAL_DEPTH' not in variables and 'instrument_nominal_depth' not in attributes: - error_list.append('no NOMINAL_DEPTH') + if ( + "NOMINAL_DEPTH" not in variables + and "instrument_nominal_depth" not in attributes + ): + error_list.append("no NOMINAL_DEPTH") - required_attributes = {'time_deployment_start', 'time_deployment_end'} + required_attributes = {"time_deployment_start", "time_deployment_end"} have_time_attributes = True for attr in required_attributes - attributes: - error_list.append('no {} attribute'.format(attr)) + error_list.append("no {} attribute".format(attr)) have_time_attributes = False # check for existence of in-water data if have_time_attributes and not in_water_index(nc).any(): - error_list.append('no in-water data') + error_list.append("no in-water data") # check qc flag conventions for VoI and depth/pressure error_list.extend(check_imos_flag_conventions(nc)) @@ -139,10 +158,12 @@ def check_file(nc, site_code, variables_of_interest, return error_list -def check_velocity_file(nc, site_code, - required_variables=('TIME', 'DEPTH', 'LATITUDE', 'LONGITUDE', 'UCUR', 'VCUR'), - allowed_dimensions=('TIME', 'LATITUDE', 'LONGITUDE', 'HEIGHT_ABOVE_SENSOR') - ): +def check_velocity_file( + nc, + site_code, + required_variables=("TIME", "DEPTH", "LATITUDE", "LONGITUDE", "UCUR", "VCUR"), + allowed_dimensions=("TIME", "LATITUDE", "LONGITUDE", "HEIGHT_ABOVE_SENSOR"), +): """ Check that a file meets the requirements for inclusion in a product. 
Return a list of errors @@ -165,8 +186,13 @@ def check_velocity_file(nc, site_code, :return: list of failed tests """ - return check_file(nc, site_code, variables_of_interest=('UCUR', 'VCUR', 'WCUR'), - required_variables=required_variables, allowed_dimensions=allowed_dimensions) + return check_file( + nc, + site_code, + variables_of_interest=("UCUR", "VCUR", "WCUR"), + required_variables=required_variables, + allowed_dimensions=allowed_dimensions, + ) def in_water_index(nc): @@ -177,11 +203,12 @@ def in_water_index(nc): :param nc: xarray dataset :return: numpy.ndarray boolean index array """ - time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1]) - time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1]) - TIME = nc['TIME'][:] + time_deployment_start = np.datetime64(nc.attrs["time_deployment_start"][:-1]) + time_deployment_end = np.datetime64(nc.attrs["time_deployment_end"][:-1]) + TIME = nc["TIME"][:] return (TIME >= time_deployment_start) & (TIME <= time_deployment_end) + def in_water(nc): """ cut data to in-water only timestamps, dropping resulting NaN. diff --git a/aodntools/timeseries_products/gridded_timeseries.py b/aodntools/timeseries_products/gridded_timeseries.py index 87de0fc..00600fc 100644 --- a/aodntools/timeseries_products/gridded_timeseries.py +++ b/aodntools/timeseries_products/gridded_timeseries.py @@ -11,11 +11,15 @@ from pkg_resources import resource_filename from aodntools import __version__ -from aodntools.timeseries_products.common import current_utc_timestamp, TIMESTAMP_FORMAT, DATESTAMP_FORMAT +from aodntools.timeseries_products.common import ( + current_utc_timestamp, + TIMESTAMP_FORMAT, + DATESTAMP_FORMAT, +) import aodntools.timeseries_products.aggregated_timeseries as TStools -TEMPLATE_JSON = resource_filename(__name__, 'gridded_timeseries_template.json') +TEMPLATE_JSON = resource_filename(__name__, "gridded_timeseries_template.json") def make_depth_bins(nc, increment=10): @@ -32,6 +36,7 @@ def make_depth_bins(nc, increment=10): return dbin + def adjust_depth_bins(depth_bins, depth_min, depth_max): """ Adjust the provided depth bins to match the min and max registered depth @@ -65,18 +70,18 @@ def get_depth_mask(depth_bins, depths, max_separation): return depth_mask - def sort_depths(depths, values): """ Sort the list of depths and values """ - index =list(range(len(depths))) + index = list(range(len(depths))) index.sort(key=depths.__getitem__) sorted_depths = [depths[i] for i in index] sorted_values = [values[i] for i in index] return sorted_depths, sorted_values + def write_netCDF_aggfile(agg_dataset, output_path, encoding): """ write netcdf file @@ -86,7 +91,7 @@ def write_netCDF_aggfile(agg_dataset, output_path, encoding): :return: name of the netCDf file written """ - agg_dataset.to_netcdf(output_path, encoding=encoding, format='NETCDF4_CLASSIC') + agg_dataset.to_netcdf(output_path, encoding=encoding, format="NETCDF4_CLASSIC") return output_path @@ -104,13 +109,16 @@ def set_variableattr(varlist, variable_attribute_dictionary, add_variable_attrib # with open(templatefile) as json_file: # variable_metadata = json.load(json_file)['_variables'] variable_attributes = {key: variable_attribute_dictionary[key] for key in varlist} - if len(add_variable_attribute)>0: + if len(add_variable_attribute) > 0: for key in add_variable_attribute.keys(): variable_attributes[key].update(add_variable_attribute[key]) return variable_attributes -def generate_netcdf_output_filename(nc, facility_code, data_code, VoI, site_code, 
product_type, file_version): + +def generate_netcdf_output_filename( + nc, facility_code, data_code, VoI, site_code, product_type, file_version +): """ generate the output filename for the VoI netCDF file @@ -123,20 +131,43 @@ def generate_netcdf_output_filename(nc, facility_code, data_code, VoI, site_code :return: name of the output file """ - if '_' in VoI: - VoI = VoI.replace('_', '-') + if "_" in VoI: + VoI = VoI.replace("_", "-") t_start = pd.to_datetime(nc.TIME.min().values).strftime(DATESTAMP_FORMAT) t_end = pd.to_datetime(nc.TIME.max().values).strftime(DATESTAMP_FORMAT) - output_name = '_'.join(['IMOS', facility_code, data_code, t_start, site_code, ('FV0'+str(file_version)), (VoI+"-"+product_type), ('END-'+ t_end), 'C-' + current_utc_timestamp(DATESTAMP_FORMAT)]) + '.nc' + output_name = ( + "_".join( + [ + "IMOS", + facility_code, + data_code, + t_start, + site_code, + ("FV0" + str(file_version)), + (VoI + "-" + product_type), + ("END-" + t_end), + "C-" + current_utc_timestamp(DATESTAMP_FORMAT), + ] + ) + + ".nc" + ) return output_name - ## MAIN FUNCTION -def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bins_increment=10, - input_dir='', output_dir='.', download_url_prefix=None, opendap_url_prefix=None): +def grid_variable( + input_file, + VoI, + depth_bins=None, + max_separation=50, + depth_bins_increment=10, + input_dir="", + output_dir=".", + download_url_prefix=None, + opendap_url_prefix=None, +): """ Grid VoI into depth_bins. :param input_file: Input hourly aggregated file with VoI, DEPTH and TIME only (path interpreted relative @@ -153,7 +184,7 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin """ with xr.open_dataset(os.path.join(input_dir, input_file)) as nc_full: - nc = nc_full[[VoI, 'TIME', 'DEPTH']] + nc = nc_full[[VoI, "TIME", "DEPTH"]] ## get lat/lon longitude_mean = nc_full.LONGITUDE.mean() latitude_mean = nc_full.LATITUDE.mean() @@ -162,33 +193,37 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin ## get global attributes input_global_attributes = nc.attrs - - ## in case no depth bins provided, create depth bins to the nearest rounded depth ## if provided, adjust to the min-max registered depth if not depth_bins: depth_bins = make_depth_bins(nc=nc, increment=depth_bins_increment) else: - depth_bins = [float(depth) for depth in depth_bins] # in case depth_bins provided through inline arguments + depth_bins = [ + float(depth) for depth in depth_bins + ] # in case depth_bins provided through inline arguments depth_bins = adjust_depth_bins(depth_bins, nc.DEPTH.min(), nc.DEPTH.max()) ## create empty containers - time_0 = pd.to_datetime('1950-01-01T00:00:00') + time_0 = pd.to_datetime("1950-01-01T00:00:00") time_min = nc.TIME.values.min() depth_bin_len = len(depth_bins) ## create empty containers for the interpolated values - VoI_temp = xr.DataArray(np.full((depth_bin_len, 1), np.nan, dtype=np.float32), coords=[depth_bins, [time_0]], - dims=['DEPTH', 'TIME']) - VoI_ndepths = xr.DataArray(np.full(1, 0, dtype='int'), coords=[[time_0]], dims=['TIME']) + VoI_temp = xr.DataArray( + np.full((depth_bin_len, 1), np.nan, dtype=np.float32), + coords=[depth_bins, [time_0]], + dims=["DEPTH", "TIME"], + ) + VoI_ndepths = xr.DataArray( + np.full(1, 0, dtype="int"), coords=[[time_0]], dims=["TIME"] + ) ## group nc by individual timestamps - VoI_grouped = nc.groupby('TIME') - + VoI_grouped = nc.groupby("TIME") for timestamp, group in VoI_grouped: time = [timestamp] - n_depths = 
np.array(len(group[VoI]), dtype='int') + n_depths = np.array(len(group[VoI]), dtype="int") if n_depths >= 2: VoI_values = list(group[VoI].values) @@ -197,9 +232,13 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin depth, VoI_values = sort_depths(depth, VoI_values) ## check for max separation - depth_mask = get_depth_mask(depth_bins=depth_bins, depths=depth, max_separation=max_separation) + depth_mask = get_depth_mask( + depth_bins=depth_bins, depths=depth, max_separation=max_separation + ) ## do the interpolation - interpolated_var = np.interp(depth_bins, depth, VoI_values, left=np.nan, right=np.nan) + interpolated_var = np.interp( + depth_bins, depth, VoI_values, left=np.nan, right=np.nan + ) ## set masked depth bins to zero interpolated_var = interpolated_var * depth_mask interpolated_var[interpolated_var == 0] = np.nan @@ -207,108 +246,148 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin else: interpolated_var = np.full((depth_bin_len, 1), np.nan) - VoI_temp_tmp = xr.DataArray(interpolated_var.reshape(depth_bin_len, 1), coords=[depth_bins, time], - dims=['DEPTH', 'TIME']) - VoI_ndepths_tmp = xr.DataArray([n_depths], coords=[time], dims=['TIME']) + VoI_temp_tmp = xr.DataArray( + interpolated_var.reshape(depth_bin_len, 1), + coords=[depth_bins, time], + dims=["DEPTH", "TIME"], + ) + VoI_ndepths_tmp = xr.DataArray([n_depths], coords=[time], dims=["TIME"]) ## concatenate the interpolated values - VoI_temp = xr.concat([VoI_temp, VoI_temp_tmp], dim='TIME') - VoI_ndepths = xr.concat([VoI_ndepths, VoI_ndepths_tmp], dim='TIME') + VoI_temp = xr.concat([VoI_temp, VoI_temp_tmp], dim="TIME") + VoI_ndepths = xr.concat([VoI_ndepths, VoI_ndepths_tmp], dim="TIME") - VoI_interpolated = xr.Dataset({VoI: VoI_temp.astype(np.float32), - VoI + '_count': VoI_ndepths.astype('int16')}) + VoI_interpolated = xr.Dataset( + {VoI: VoI_temp.astype(np.float32), VoI + "_count": VoI_ndepths.astype("int16")} + ) ## drop the very first record as it is dummy - VoI_interpolated = VoI_interpolated.where(VoI_interpolated.TIME >= time_min, drop=True) + VoI_interpolated = VoI_interpolated.where( + VoI_interpolated.TIME >= time_min, drop=True + ) ## Add lat/lon as scalar variables - VoI_interpolated = VoI_interpolated.assign(LONGITUDE = longitude_mean, - LATITUDE = latitude_mean) + VoI_interpolated = VoI_interpolated.assign( + LONGITUDE=longitude_mean, LATITUDE=latitude_mean + ) ## transpose dimensions to make CF compliant - VoI_interpolated = VoI_interpolated.transpose('TIME', 'DEPTH') + VoI_interpolated = VoI_interpolated.transpose("TIME", "DEPTH") ## get the variables attribute dictionary with open(TEMPLATE_JSON) as json_file: attr_dictionary = json.load(json_file) - variable_attribute_dictionary = attr_dictionary['_variables'] - global_attribute_dictionary = attr_dictionary['_global'] + variable_attribute_dictionary = attr_dictionary["_variables"] + global_attribute_dictionary = attr_dictionary["_global"] ## set variable attributes varlist = list(VoI_interpolated.variables) add_variable_attribute = {} - variable_attributes = set_variableattr(varlist, variable_attribute_dictionary, add_variable_attribute) - time_units = variable_attributes['TIME'].pop('units') - time_calendar = variable_attributes['TIME'].pop('calendar') + variable_attributes = set_variableattr( + varlist, variable_attribute_dictionary, add_variable_attribute + ) + time_units = variable_attributes["TIME"].pop("units") + time_calendar = variable_attributes["TIME"].pop("calendar") for variable 
in varlist: VoI_interpolated[variable].attrs = variable_attributes[variable] ## set global attributes # copy selected attributes from input file - for attr in ('geospatial_lat_min', 'geospatial_lat_max', 'geospatial_lon_min', 'geospatial_lon_max', 'site_code', - 'included_values_flagged_as', 'contributor_name', 'contributor_role', 'contributor_email'): + for attr in ( + "geospatial_lat_min", + "geospatial_lat_max", + "geospatial_lon_min", + "geospatial_lon_max", + "site_code", + "included_values_flagged_as", + "contributor_name", + "contributor_role", + "contributor_email", + ): VoI_interpolated.attrs[attr] = input_global_attributes[attr] - date_start = pd.to_datetime(VoI_interpolated.TIME.values.min()).strftime(TIMESTAMP_FORMAT) - date_end = pd.to_datetime(VoI_interpolated.TIME.values.max()).strftime(TIMESTAMP_FORMAT) + date_start = pd.to_datetime(VoI_interpolated.TIME.values.min()).strftime( + TIMESTAMP_FORMAT + ) + date_end = pd.to_datetime(VoI_interpolated.TIME.values.max()).strftime( + TIMESTAMP_FORMAT + ) date_created = current_utc_timestamp() VoI_interpolated.attrs.update(global_attribute_dictionary) - VoI_interpolated.attrs.update({ - 'source_file': input_file, - 'time_coverage_start': date_start, - 'time_coverage_end': date_end, - 'geospatial_vertical_min': min(depth_bins), - 'geospatial_vertical_max': max(depth_bins), - 'keywords': ', '.join([VoI, 'DEPTH'] + ['HOURLY', 'GRIDDED']), - 'abstract': global_attribute_dictionary['abstract'].format(VoI=VoI, site_code=site_code), - 'date_created': date_created, - 'history': input_global_attributes['history'] +' {date_created}: Gridded file created.'.format( - date_created=date_created), - 'generating_code_version': __version__, - 'title': global_attribute_dictionary['title'].format(VoI=VoI, - site_code=site_code, - time_min=date_start, - time_max=date_end, - depth_min=min(depth_bins), - depth_max = max(depth_bins)) - }) - github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/' - '{v}/aodntools/timeseries_products/{f}'.format(v=__version__, f=os.path.basename(__file__)) - ) - VoI_interpolated.attrs['lineage'] += github_comment + VoI_interpolated.attrs.update( + { + "source_file": input_file, + "time_coverage_start": date_start, + "time_coverage_end": date_end, + "geospatial_vertical_min": min(depth_bins), + "geospatial_vertical_max": max(depth_bins), + "keywords": ", ".join([VoI, "DEPTH"] + ["HOURLY", "GRIDDED"]), + "abstract": global_attribute_dictionary["abstract"].format( + VoI=VoI, site_code=site_code + ), + "date_created": date_created, + "history": input_global_attributes["history"] + + " {date_created}: Gridded file created.".format( + date_created=date_created + ), + "generating_code_version": __version__, + "title": global_attribute_dictionary["title"].format( + VoI=VoI, + site_code=site_code, + time_min=date_start, + time_max=date_end, + depth_min=min(depth_bins), + depth_max=max(depth_bins), + ), + } + ) + github_comment = ( + "\nThis file was created using https://github.com/aodn/python-aodntools/blob/" + "{v}/aodntools/timeseries_products/{f}".format( + v=__version__, f=os.path.basename(__file__) + ) + ) + VoI_interpolated.attrs["lineage"] += github_comment if download_url_prefix: - VoI_interpolated.attrs['source_file_download'] = os.path.join(download_url_prefix, input_file) + VoI_interpolated.attrs["source_file_download"] = os.path.join( + download_url_prefix, input_file + ) if opendap_url_prefix: - VoI_interpolated.attrs['source_file_opendap'] = os.path.join(opendap_url_prefix, 
input_file) + VoI_interpolated.attrs["source_file_opendap"] = os.path.join( + opendap_url_prefix, input_file + ) VoI_interpolated.attrs = sorted(VoI_interpolated.attrs.items()) ## create the output file name and write the aggregated product as netCDF facility_code = TStools.get_facility_code(input_file) - data_code = TStools.get_data_code(VoI) + 'Z' - product_type = 'gridded-timeseries' + data_code = TStools.get_data_code(VoI) + "Z" + product_type = "gridded-timeseries" file_version = 2 - ncout_filename = generate_netcdf_output_filename(nc=VoI_interpolated, facility_code=facility_code, - data_code=data_code, VoI=VoI, - site_code=site_code, product_type=product_type, - file_version=file_version) + ncout_filename = generate_netcdf_output_filename( + nc=VoI_interpolated, + facility_code=facility_code, + data_code=data_code, + VoI=VoI, + site_code=site_code, + product_type=product_type, + file_version=file_version, + ) ncout_path = os.path.join(output_dir, ncout_filename) - encoding = {'TIME': {'_FillValue': None, - 'units': time_units, - 'calendar': time_calendar, - 'zlib': True, - 'complevel': 5}, - VoI: {'zlib': True, - 'complevel': 5, - 'dtype': np.dtype('float32')}, - VoI+'_count': {'dtype': np.dtype('int16'), - 'zlib': True, - 'complevel': 5}, - 'DEPTH': {'dtype': np.dtype('float32'), - 'zlib': True, - 'complevel': 5}, - 'LONGITUDE': {'_FillValue': False}, - 'LATITUDE': {'_FillValue': False}} + encoding = { + "TIME": { + "_FillValue": None, + "units": time_units, + "calendar": time_calendar, + "zlib": True, + "complevel": 5, + }, + VoI: {"zlib": True, "complevel": 5, "dtype": np.dtype("float32")}, + VoI + "_count": {"dtype": np.dtype("int16"), "zlib": True, "complevel": 5}, + "DEPTH": {"dtype": np.dtype("float32"), "zlib": True, "complevel": 5}, + "LONGITUDE": {"_FillValue": False}, + "LATITUDE": {"_FillValue": False}, + } write_netCDF_aggfile(VoI_interpolated, ncout_path, encoding) @@ -316,28 +395,77 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Gridded time series: interpolate ONE variable from ALL instruments from ALL deployments from ONE site into 1hr timestamps and fixed depth bins") - parser.add_argument('-var', dest='var', help='name of the variable to concatenate. Like TEMP, PSAL', default='TEMP', required=False) - parser.add_argument('-file', dest='filename', help='name of the Hourly Time Series Product file that contains the data', default=None, required=False) - parser.add_argument('-depth_bins', dest='depth_bins', help='list of depth where the VoI will be interpolated', default=None, nargs='+', required=False) - parser.add_argument('-max_separation', dest='max_separation', help='maximum difference between instruments to allow interpolation', default=50, required=False) - parser.add_argument('-depth_bins_increment', dest='depth_bins_increment', help='increment in meters for the automatic generated depth bins', default=10, required=False) - parser.add_argument('-indir', dest='input_dir', help='base path of input file. Default .', default='.', - required=False) - parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. 
Default .', - default='.', required=False) - parser.add_argument('-config', dest='config_file', help='JSON configuration file', default=None, required=False) + parser = argparse.ArgumentParser( + description="Gridded time series: interpolate ONE variable from ALL instruments from ALL deployments from ONE site into 1hr timestamps and fixed depth bins" + ) + parser.add_argument( + "-var", + dest="var", + help="name of the variable to concatenate. Like TEMP, PSAL", + default="TEMP", + required=False, + ) + parser.add_argument( + "-file", + dest="filename", + help="name of the Hourly Time Series Product file that contains the data", + default=None, + required=False, + ) + parser.add_argument( + "-depth_bins", + dest="depth_bins", + help="list of depth where the VoI will be interpolated", + default=None, + nargs="+", + required=False, + ) + parser.add_argument( + "-max_separation", + dest="max_separation", + help="maximum difference between instruments to allow interpolation", + default=50, + required=False, + ) + parser.add_argument( + "-depth_bins_increment", + dest="depth_bins_increment", + help="increment in meters for the automatic generated depth bins", + default=10, + required=False, + ) + parser.add_argument( + "-indir", + dest="input_dir", + help="base path of input file. Default .", + default=".", + required=False, + ) + parser.add_argument( + "-outdir", + dest="output_dir", + help="path where the result file will be written. Default .", + default=".", + required=False, + ) + parser.add_argument( + "-config", + dest="config_file", + help="JSON configuration file", + default=None, + required=False, + ) args = parser.parse_args() if args.config_file: with open(args.config_file) as ff: arguments = json.load(ff) - VoI = arguments['var'] - depth_bins = arguments['depth_bins'] - depth_bins_increment = int(arguments['depth_bins_increment']) - max_separation = int(arguments['max_separation']) - input_dir = arguments.get('input_dir', '.') - output_dir = arguments.get('output_dir', '.') + VoI = arguments["var"] + depth_bins = arguments["depth_bins"] + depth_bins_increment = int(arguments["depth_bins_increment"]) + max_separation = int(arguments["max_separation"]) + input_dir = arguments.get("input_dir", ".") + output_dir = arguments.get("output_dir", ".") else: VoI = args.var depth_bins = args.depth_bins @@ -348,6 +476,14 @@ def grid_variable(input_file, VoI, depth_bins=None, max_separation=50, depth_bin file_name = args.filename - print(grid_variable(input_file=file_name, VoI=VoI, depth_bins=depth_bins, - max_separation=int(max_separation), depth_bins_increment=int(depth_bins_increment), - input_dir=input_dir, output_dir=output_dir)) + print( + grid_variable( + input_file=file_name, + VoI=VoI, + depth_bins=depth_bins, + max_separation=int(max_separation), + depth_bins_increment=int(depth_bins_increment), + input_dir=input_dir, + output_dir=output_dir, + ) + ) diff --git a/aodntools/timeseries_products/hourly_timeseries.py b/aodntools/timeseries_products/hourly_timeseries.py index bc07953..1785b7a 100644 --- a/aodntools/timeseries_products/hourly_timeseries.py +++ b/aodntools/timeseries_products/hourly_timeseries.py @@ -13,14 +13,21 @@ from aodntools import __version__ from aodntools.timeseries_products import aggregated_timeseries as utils -from aodntools.timeseries_products.common import (NoInputFilesError, check_file, get_qc_variable_names, in_water, - current_utc_timestamp, TIMESTAMP_FORMAT, DATESTAMP_FORMAT) +from aodntools.timeseries_products.common import ( + NoInputFilesError, + 
check_file, + get_qc_variable_names, + in_water, + current_utc_timestamp, + TIMESTAMP_FORMAT, + DATESTAMP_FORMAT, +) -TEMPLATE_JSON = resource_filename(__name__, 'hourly_timeseries_template.json') -BINNING_METHOD_JSON = resource_filename(__name__, 'binning_method.json') +TEMPLATE_JSON = resource_filename(__name__, "hourly_timeseries_template.json") +BINNING_METHOD_JSON = resource_filename(__name__, "binning_method.json") -def check_files(file_list, site_code, parameter_names_accepted, input_dir=''): +def check_files(file_list, site_code, parameter_names_accepted, input_dir=""): """ Return a chronologically sorted file_list and a dictionary if the file fails one or more of the tests @@ -40,19 +47,19 @@ def check_files(file_list, site_code, parameter_names_accepted, input_dir=''): if error_list: error_dict.update({file: error_list}) else: - file_list_dataframe = file_list_dataframe.append({'url': file, - 'deployment_date': parse(nc.time_deployment_start)}, - ignore_index=True) + file_list_dataframe = file_list_dataframe.append( + {"url": file, "deployment_date": parse(nc.time_deployment_start)}, + ignore_index=True, + ) - file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date') - file_list = file_list_dataframe['url'].to_list() + file_list_dataframe = file_list_dataframe.sort_values(by="deployment_date") + file_list = file_list_dataframe["url"].to_list() if file_list == []: raise NoInputFilesError("no valid input files to aggregate") return file_list, error_dict - def get_parameter_names(nc): """ get the names of the parameters that HAVE _quality_control ancillary var @@ -61,7 +68,10 @@ def get_parameter_names(nc): :param nc: xarray dataset :return: list of names """ - params = list(set([s.strip('_quality_control') for s in get_qc_variable_names(nc)]) - set(list(nc.coords))) + params = list( + set([s.strip("_quality_control") for s in get_qc_variable_names(nc)]) + - set(list(nc.coords)) + ) return params @@ -74,14 +84,20 @@ def good_data_only(nc, qcflags): :return: xarray masked Dataset, dictionary of % of qced values per variable """ varnames = get_parameter_names(nc) - nc_masked = nc[varnames[0]].where(nc[varnames[0] + '_quality_control'].isin(qcflags)).to_dataset(name=varnames[0]) + nc_masked = ( + nc[varnames[0]] + .where(nc[varnames[0] + "_quality_control"].isin(qcflags)) + .to_dataset(name=varnames[0]) + ) for variable in varnames[1:]: - nc_masked[variable] = nc[variable].where(nc[variable + '_quality_control'].isin(qcflags)) + nc_masked[variable] = nc[variable].where( + nc[variable + "_quality_control"].isin(qcflags) + ) return nc_masked -def get_QCcount (nc, qcflags): +def get_QCcount(nc, qcflags): """ count the number of qced values in the file :param nc: xarray dataset @@ -89,18 +105,21 @@ def get_QCcount (nc, qcflags): :return: dictionary with % of registers QCed """ qc_total_count = {} - if 0 in qcflags and len(qcflags)>1: + if 0 in qcflags and len(qcflags) > 1: varnames = get_parameter_names(nc) for variable in varnames: flag_count = [] for flag in qcflags: - flag_count.append(int(np.sum(nc[variable+'_quality_control']==flag))) - qc_total_count[variable] = {'qc0_count': flag_count[0]} - qc_total_count[variable].update({'qcnon0_count': sum(flag_count[1:])}) + flag_count.append( + int(np.sum(nc[variable + "_quality_control"] == flag)) + ) + qc_total_count[variable] = {"qc0_count": flag_count[0]} + qc_total_count[variable].update({"qcnon0_count": sum(flag_count[1:])}) return qc_total_count + def update_QCcount(qc_count_all, qc_count): """ Update qc count 
dictionary @@ -110,13 +129,14 @@ def update_QCcount(qc_count_all, qc_count): """ for variable in qc_count.keys(): if variable in qc_count_all.keys(): - qc_count_all[variable]['qc0_count'] += qc_count[variable]['qc0_count'] - qc_count_all[variable]['qcnon0_count'] += qc_count[variable]['qcnon0_count'] + qc_count_all[variable]["qc0_count"] += qc_count[variable]["qc0_count"] + qc_count_all[variable]["qcnon0_count"] += qc_count[variable]["qcnon0_count"] else: qc_count_all[variable] = qc_count[variable] return qc_count_all + def get_QC_percent(qc_count): """ Calculate the % of qc values in the variables of a file @@ -126,15 +146,27 @@ def get_QC_percent(qc_count): qc_percent = {} if len(qc_count) > 0: for variable in qc_count.keys(): - if qc_count[variable]['qcnon0_count'] > 0: - qc_percent[variable] = {'percent_quality_controlled': round(100*(1-qc_count[variable]['qc0_count']/(qc_count[variable]['qcnon0_count'] + qc_count[variable]['qc0_count'])),2)} + if qc_count[variable]["qcnon0_count"] > 0: + qc_percent[variable] = { + "percent_quality_controlled": round( + 100 + * ( + 1 + - qc_count[variable]["qc0_count"] + / ( + qc_count[variable]["qcnon0_count"] + + qc_count[variable]["qc0_count"] + ) + ), + 2, + ) + } else: - qc_percent[variable] = {'percent_quality_controlled': 0.00} + qc_percent[variable] = {"percent_quality_controlled": 0.00} return qc_percent - def get_nominal_depth(nc): """ return nominal depth from NOMINAL_DEPTH variable or @@ -144,7 +176,7 @@ def get_nominal_depth(nc): :return: nominal depth of the instrument """ - if 'NOMINAL_DEPTH' in list(nc.variables): + if "NOMINAL_DEPTH" in list(nc.variables): nominal_depth = nc.NOMINAL_DEPTH.squeeze().values else: nominal_depth = nc.instrument_nominal_depth @@ -152,7 +184,9 @@ def get_nominal_depth(nc): return nominal_depth -def set_globalattr(nc_aggregated, templatefile, site_code, add_attribute, parameter_names): +def set_globalattr( + nc_aggregated, templatefile, site_code, add_attribute, parameter_names +): """ global attributes from a reference nc file and nc file @@ -164,25 +198,36 @@ def set_globalattr(nc_aggregated, templatefile, site_code, add_attribute, parame :return: dictionary of global attributes """ - timeformat = '%Y-%m-%dT%H:%M:%SZ' + timeformat = "%Y-%m-%dT%H:%M:%SZ" with open(templatefile) as json_file: global_metadata = json.load(json_file)["_global"] - agg_attr = {'title': ("Long time series Hourly Aggregated product: all available non-velocity variables at " + - site_code + " between " + pd.to_datetime(nc_aggregated.TIME.values.min()).strftime(timeformat) + - " and " + pd.to_datetime(nc_aggregated.TIME.values.max()).strftime(timeformat)), - 'site_code': site_code, - 'time_coverage_start': pd.to_datetime(nc_aggregated.TIME.values.min()).strftime(timeformat), - 'time_coverage_end': pd.to_datetime(nc_aggregated.TIME.values.max()).strftime(timeformat), - 'geospatial_vertical_min': float(nc_aggregated.DEPTH.min()), - 'geospatial_vertical_max': float(nc_aggregated.DEPTH.max()), - 'geospatial_lat_min': nc_aggregated.LATITUDE.values.min(), - 'geospatial_lat_max': nc_aggregated.LATITUDE.values.max(), - 'geospatial_lon_min': nc_aggregated.LONGITUDE.values.min(), - 'geospatial_lon_max': nc_aggregated.LONGITUDE.values.max(), - 'date_created': current_utc_timestamp(), - 'history': current_utc_timestamp() + ': Hourly aggregated file created.', - 'keywords': ', '.join(parameter_names + ['HOURLY', 'AGGREGATED'])} + agg_attr = { + "title": ( + "Long time series Hourly Aggregated product: all available non-velocity variables at " + + 
site_code + + " between " + + pd.to_datetime(nc_aggregated.TIME.values.min()).strftime(timeformat) + + " and " + + pd.to_datetime(nc_aggregated.TIME.values.max()).strftime(timeformat) + ), + "site_code": site_code, + "time_coverage_start": pd.to_datetime(nc_aggregated.TIME.values.min()).strftime( + timeformat + ), + "time_coverage_end": pd.to_datetime(nc_aggregated.TIME.values.max()).strftime( + timeformat + ), + "geospatial_vertical_min": float(nc_aggregated.DEPTH.min()), + "geospatial_vertical_max": float(nc_aggregated.DEPTH.max()), + "geospatial_lat_min": nc_aggregated.LATITUDE.values.min(), + "geospatial_lat_max": nc_aggregated.LATITUDE.values.max(), + "geospatial_lon_min": nc_aggregated.LONGITUDE.values.min(), + "geospatial_lon_max": nc_aggregated.LONGITUDE.values.max(), + "date_created": current_utc_timestamp(), + "history": current_utc_timestamp() + ": Hourly aggregated file created.", + "keywords": ", ".join(parameter_names + ["HOURLY", "AGGREGATED"]), + } global_metadata.update(agg_attr) global_metadata.update(add_attribute) @@ -198,24 +243,26 @@ def get_data_code(VoI): """ # dictionary of data code. better if it is read from external file - dataCodes = {'DEPTH': 'Z', - 'PRES': 'Z', - 'PRES_REL': 'Z', - 'TEMP': 'T', - 'PSAL': 'S', - 'CNDC': 'C', - 'PAR': 'F', - 'TURB': 'U', - 'TURBF': 'U', - 'DOX1': 'O', - 'DOX1_2': 'O', - 'DOX1_3': 'O', - 'DOX2': 'O', - 'DOX2_1': 'O', - 'DOXS': 'O', - 'CPHL': 'B', - 'CHLU': 'B', - 'CHLF': 'B'} + dataCodes = { + "DEPTH": "Z", + "PRES": "Z", + "PRES_REL": "Z", + "TEMP": "T", + "PSAL": "S", + "CNDC": "C", + "PAR": "F", + "TURB": "U", + "TURBF": "U", + "DOX1": "O", + "DOX1_2": "O", + "DOX1_3": "O", + "DOX2": "O", + "DOX2_1": "O", + "DOXS": "O", + "CPHL": "B", + "CHLU": "B", + "CHLF": "B", + } if VoI in dataCodes: dCode = dataCodes[VoI] @@ -246,7 +293,9 @@ def create_empty_dataframe(columns): return pd.DataFrame({k: pd.Series(dtype=t) for k, t in columns}) -def generate_netcdf_output_filename(nc, facility_code, data_code, site_code, product_type, file_version): +def generate_netcdf_output_filename( + nc, facility_code, data_code, site_code, product_type, file_version +): """ generate the output filename for the VoI netCDF file @@ -262,14 +311,26 @@ def generate_netcdf_output_filename(nc, facility_code, data_code, site_code, pro t_start = pd.to_datetime(nc.TIME.min().values).strftime(DATESTAMP_FORMAT) t_end = pd.to_datetime(nc.TIME.max().values).strftime(DATESTAMP_FORMAT) - output_name = '_'.join( - ['IMOS', facility_code, data_code, t_start, site_code, ('FV0' + str(file_version)), product_type, - ('END-' + t_end), 'C-' + current_utc_timestamp(DATESTAMP_FORMAT)]) + '.nc' + output_name = ( + "_".join( + [ + "IMOS", + facility_code, + data_code, + t_start, + site_code, + ("FV0" + str(file_version)), + product_type, + ("END-" + t_end), + "C-" + current_utc_timestamp(DATESTAMP_FORMAT), + ] + ) + + ".nc" + ) return output_name - def write_netCDF_aggfile(nc_aggregated, ncout_filename, encoding): """ write netcdf file @@ -281,13 +342,26 @@ def write_netCDF_aggfile(nc_aggregated, ncout_filename, encoding): """ ## sort the variables in the data set variables_all = list(nc_aggregated.variables) - variables_head = ['instrument_index', 'instrument_id', 'source_file', 'TIME', 'LONGITUDE', 'LATITUDE', - 'NOMINAL_DEPTH', 'DEPTH', 'DEPTH_count', 'DEPTH_min', 'DEPTH_max', 'DEPTH_std', ] + variables_head = [ + "instrument_index", + "instrument_id", + "source_file", + "TIME", + "LONGITUDE", + "LATITUDE", + "NOMINAL_DEPTH", + "DEPTH", + "DEPTH_count", + "DEPTH_min", + 
"DEPTH_max", + "DEPTH_std", + ] variables_rest = sorted(list(set(variables_all) - set(variables_head))) variables_all = variables_head + variables_rest - nc_aggregated[variables_all].to_netcdf(ncout_filename, encoding=encoding, - format='NETCDF4_CLASSIC') + nc_aggregated[variables_all].to_netcdf( + ncout_filename, encoding=encoding, format="NETCDF4_CLASSIC" + ) return ncout_filename @@ -308,7 +382,9 @@ def PDresample_by_hour(df, function_dict, function_stats): df_data = pd.DataFrame(index=pd.DatetimeIndex([])) for variable in varnames: ds_var = df[variable] - ds_var_resample = ds_var.resample('1H', base=0.5) # shift by half hour to centre bin on the hour + ds_var_resample = ds_var.resample( + "1H", base=0.5 + ) # shift by half hour to centre bin on the hour ds_var_mean = ds_var_resample.apply(function_dict[variable]).astype(np.float32) df_data = pd.concat([df_data, ds_var_mean], axis=1, sort=False) for stat_method in function_stats: @@ -317,16 +393,21 @@ def PDresample_by_hour(df, function_dict, function_stats): df_data = pd.concat([df_data, ds_var_stat], axis=1, sort=False) ##forward the index 30min so the timestamps are on the hour - df_data.index += pd.to_timedelta('30min') + df_data.index += pd.to_timedelta("30min") return df_data - - ### MAIN FUNCTION -def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', output_dir='./', - download_url_prefix=None, opendap_url_prefix=None): +def hourly_aggregator( + files_to_aggregate, + site_code, + qcflags, + input_dir="", + output_dir="./", + download_url_prefix=None, + opendap_url_prefix=None, +): """ Aggregate a dataset into 1 hour intervals and calculate related statistics @@ -341,39 +422,62 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp :return: tuple (path of the output file, dict of rejected files and error messages) """ - parameter_names_accepted = ['DEPTH', 'CPHL', 'CHLF', 'CHLU', 'DOX', 'DOX1', 'DOX1_2', 'DOX1_3', 'DOX2', - 'DOX2_1', 'DOXS', 'DOXY', 'PRES', 'PRES_REL', 'PSAL', 'TEMP', 'TURB', 'TURBF', 'PAR'] - function_stats = ['min', 'max', 'std', 'count'] - qcflags_names = {0: 'No_QC_performed', 1: 'Good_data', 2: 'Probably_good_data', - 3: 'Bad_data_that_are_potentially_correctable', 4: 'Bad_data'} - - + parameter_names_accepted = [ + "DEPTH", + "CPHL", + "CHLF", + "CHLU", + "DOX", + "DOX1", + "DOX1_2", + "DOX1_3", + "DOX2", + "DOX2_1", + "DOXS", + "DOXY", + "PRES", + "PRES_REL", + "PSAL", + "TEMP", + "TURB", + "TURBF", + "PAR", + ] + function_stats = ["min", "max", "std", "count"] + qcflags_names = { + 0: "No_QC_performed", + 1: "Good_data", + 2: "Probably_good_data", + 3: "Bad_data_that_are_potentially_correctable", + 4: "Bad_data", + } ## make sure that the list of qflags is sorted qcflags = sorted(qcflags) # Check files and sort chronologically - files_to_aggregate, bad_files = check_files(files_to_aggregate, site_code, parameter_names_accepted, - input_dir=input_dir) + files_to_aggregate, bad_files = check_files( + files_to_aggregate, site_code, parameter_names_accepted, input_dir=input_dir + ) ## get binning function dictionary with open(BINNING_METHOD_JSON) as json_file: function_dict = json.load(json_file) - ## get the variables attribute dictionary with open(TEMPLATE_JSON) as json_file: - variable_attribute_dictionary = json.load(json_file)['_variables'] + variable_attribute_dictionary = json.load(json_file)["_variables"] df_data = pd.DataFrame() - ## create empty DF with dtypes - metadata_df_types = [('source_file', str), - ('instrument_id', str), - ('LONGITUDE', 
float), - ('LATITUDE', float), - ('NOMINAL_DEPTH', float)] + metadata_df_types = [ + ("source_file", str), + ("instrument_id", str), + ("LONGITUDE", float), + ("LATITUDE", float), + ("NOMINAL_DEPTH", float), + ] df_metadata = create_empty_dataframe(metadata_df_types) ## containers @@ -383,13 +487,17 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp for file_index, file in enumerate(files_to_aggregate): print(file_index) - with xr.open_dataset(os.path.join(input_dir, file), mask_and_scale=True, decode_times=True) as nc: - parameter_names = list(set(list(nc.variables)) & set(parameter_names_accepted)) + with xr.open_dataset( + os.path.join(input_dir, file), mask_and_scale=True, decode_times=True + ) as nc: + parameter_names = list( + set(list(nc.variables)) & set(parameter_names_accepted) + ) parameter_names_all += parameter_names ## get PRES_REl offset, if exits - if 'PRES_REL' in parameter_names: - if 'applied_offset' in nc.PRES_REL.attrs: + if "PRES_REL" in parameter_names: + if "applied_offset" in nc.PRES_REL.attrs: applied_offset.append(nc.PRES_REL.applied_offset) else: applied_offset.append(np.nan) @@ -398,42 +506,61 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp qc_count = get_QCcount(nc_clean, qcflags) qc_count_all = update_QCcount(qc_count_all, qc_count) nc_clean = good_data_only(nc_clean, qcflags) # good quality data only - df_metadata = df_metadata.append({'source_file': file, - 'instrument_id': utils.get_instrument_id(nc), - 'LONGITUDE': nc.LONGITUDE.squeeze().values, - 'LATITUDE': nc.LATITUDE.squeeze().values, - 'NOMINAL_DEPTH': get_nominal_depth(nc)}, - ignore_index=True) + df_metadata = df_metadata.append( + { + "source_file": file, + "instrument_id": utils.get_instrument_id(nc), + "LONGITUDE": nc.LONGITUDE.squeeze().values, + "LATITUDE": nc.LATITUDE.squeeze().values, + "NOMINAL_DEPTH": get_nominal_depth(nc), + }, + ignore_index=True, + ) # If TIME had out-of-range values before cleaning, nc_clean would now have a CFTimeIndex, which # breaks the resampling further down. 
Here we reset it to a DatetimeIndex as suggested here: # https://stackoverflow.com/questions/55786995/converting-cftime-datetimejulian-to-datetime/55787899#55787899 - if isinstance(nc_clean.indexes['TIME'], xr.coding.cftimeindex.CFTimeIndex): - nc_clean['TIME'] = nc_clean.indexes['TIME'].to_datetimeindex() + if isinstance(nc_clean.indexes["TIME"], xr.coding.cftimeindex.CFTimeIndex): + nc_clean["TIME"] = nc_clean.indexes["TIME"].to_datetimeindex() df_temp = nc_clean[parameter_names].to_dataframe() ## keep TIME as the only index (for ADCP files it would be a MultiIndex at this point) df_temp.reset_index(inplace=True) - df_temp.set_index('TIME', inplace=True) + df_temp.set_index("TIME", inplace=True) df_temp = df_temp[parameter_names] - df_temp = PDresample_by_hour(df_temp, function_dict, function_stats) # do the magic - df_temp['instrument_index'] = np.repeat(file_index, len(df_temp)).astype(np.int32) - df_data = pd.concat([df_data, df_temp.reset_index()], ignore_index=True, sort=False) - - df_metadata.index.rename('INSTRUMENT', inplace=True) - df_data.index.rename('OBSERVATION', inplace=True) + df_temp = PDresample_by_hour( + df_temp, function_dict, function_stats + ) # do the magic + df_temp["instrument_index"] = np.repeat(file_index, len(df_temp)).astype( + np.int32 + ) + df_data = pd.concat( + [df_data, df_temp.reset_index()], ignore_index=True, sort=False + ) + + df_metadata.index.rename("INSTRUMENT", inplace=True) + df_data.index.rename("OBSERVATION", inplace=True) ## rename index to TIME - df_data.rename(columns={'index': 'TIME'}, inplace=True) + df_data.rename(columns={"index": "TIME"}, inplace=True) qc_proportion_all = get_QC_percent(qc_count_all) - - nc_metadata = xr.Dataset({'LONGITUDE': (['INSTRUMENT'], df_metadata['LONGITUDE'].astype('float64')), - 'LATITUDE': (['INSTRUMENT'], df_metadata['LATITUDE'].astype('float64')), - 'NOMINAL_DEPTH': (['INSTRUMENT'], df_metadata['NOMINAL_DEPTH'].astype('float32')), - 'instrument_id': (['INSTRUMENT'], df_metadata['instrument_id'].astype('|S256')), - 'source_file': (['INSTRUMENT'], df_metadata['source_file'].astype('|S256'))}) + nc_metadata = xr.Dataset( + { + "LONGITUDE": (["INSTRUMENT"], df_metadata["LONGITUDE"].astype("float64")), + "LATITUDE": (["INSTRUMENT"], df_metadata["LATITUDE"].astype("float64")), + "NOMINAL_DEPTH": ( + ["INSTRUMENT"], + df_metadata["NOMINAL_DEPTH"].astype("float32"), + ), + "instrument_id": ( + ["INSTRUMENT"], + df_metadata["instrument_id"].astype("|S256"), + ), + "source_file": (["INSTRUMENT"], df_metadata["source_file"].astype("|S256")), + } + ) ## Check and drop all nan columns column_remove_list = [] @@ -443,71 +570,98 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp column_remove_list.append(parameter) parameter_remove_list.append(parameter) for method in function_stats: - column_remove_list.append(parameter+'_'+method) + column_remove_list.append(parameter + "_" + method) df_data.drop(columns=column_remove_list, inplace=True) ## remove the drop names from the parameter_names_all list parameter_names_all = list(set(parameter_names_all) - set(parameter_remove_list)) nc_data = xr.Dataset.from_dataframe(df_data) nc_aggregated = xr.merge([nc_metadata, nc_data]) - nc_aggregated = nc_aggregated.drop('OBSERVATION') + nc_aggregated = nc_aggregated.drop("OBSERVATION") ## add global attributes - add_attribute = {'rejected_files': "\n".join(list(bad_files)), - 'included_values_flagged_as': ", ".join([qcflags_names[flag] for flag in qcflags]), - 'generating_code_version': __version__ - } 
- add_attribute.update(utils.get_contributors(files_to_aggregate, input_dir=input_dir)) - - nc_aggregated.attrs = set_globalattr(nc_aggregated, TEMPLATE_JSON, site_code, add_attribute, parameter_names) - nc_aggregated.attrs['abstract'] = nc_aggregated.attrs['abstract'].format( - site_code=site_code, - flags=", ".join(qcflags_names[flag] for flag in qcflags) + add_attribute = { + "rejected_files": "\n".join(list(bad_files)), + "included_values_flagged_as": ", ".join( + [qcflags_names[flag] for flag in qcflags] + ), + "generating_code_version": __version__, + } + add_attribute.update( + utils.get_contributors(files_to_aggregate, input_dir=input_dir) + ) + + nc_aggregated.attrs = set_globalattr( + nc_aggregated, TEMPLATE_JSON, site_code, add_attribute, parameter_names + ) + nc_aggregated.attrs["abstract"] = nc_aggregated.attrs["abstract"].format( + site_code=site_code, flags=", ".join(qcflags_names[flag] for flag in qcflags) ) if 0 in qcflags: - nc_aggregated.attrs['lineage'] += ('The percentage of quality controlled values used in the aggregation is ' - 'indicated in the percent_quality_controlled variable attribute.') - github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/' - '{v}/aodntools/timeseries_products/hourly_timeseries.py'.format(v=__version__) - ) - nc_aggregated.attrs['lineage'] += github_comment + nc_aggregated.attrs["lineage"] += ( + "The percentage of quality controlled values used in the aggregation is " + "indicated in the percent_quality_controlled variable attribute." + ) + github_comment = ( + "\nThis file was created using https://github.com/aodn/python-aodntools/blob/" + "{v}/aodntools/timeseries_products/hourly_timeseries.py".format(v=__version__) + ) + nc_aggregated.attrs["lineage"] += github_comment ## add variable attributes - variablenames_others = ['TIME', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH', - 'instrument_index', 'instrument_id', 'source_file'] + variablenames_others = [ + "TIME", + "LONGITUDE", + "LATITUDE", + "NOMINAL_DEPTH", + "instrument_index", + "instrument_id", + "source_file", + ] parameter_names_all = list(set(parameter_names_all)) variable_attributes = variable_attribute_dictionary - variable_attributes['PRES_REL'].update({'applied_offset_by_instrument': applied_offset}) + variable_attributes["PRES_REL"].update( + {"applied_offset_by_instrument": applied_offset} + ) if download_url_prefix or opendap_url_prefix: - variable_attributes['source_file'].update(utils.source_file_attributes(download_url_prefix, opendap_url_prefix)) - - time_units = variable_attributes['TIME'].pop('units') - time_calendar = variable_attributes['TIME'].pop('calendar') - encoding = {'TIME': {'_FillValue': None, - 'units': time_units, - 'calendar': time_calendar}, - 'LONGITUDE': {'_FillValue': None}, - 'LATITUDE': {'_FillValue': None}, - 'NOMINAL_DEPTH': {'_FillValue': None}, - 'instrument_id': {'dtype': '|S256'}, - 'source_file': {'dtype': '|S256'}} - - -## add attributes to TIME, LAT/LON, and index variables + variable_attributes["source_file"].update( + utils.source_file_attributes(download_url_prefix, opendap_url_prefix) + ) + + time_units = variable_attributes["TIME"].pop("units") + time_calendar = variable_attributes["TIME"].pop("calendar") + encoding = { + "TIME": {"_FillValue": None, "units": time_units, "calendar": time_calendar}, + "LONGITUDE": {"_FillValue": None}, + "LATITUDE": {"_FillValue": None}, + "NOMINAL_DEPTH": {"_FillValue": None}, + "instrument_id": {"dtype": "|S256"}, + "source_file": {"dtype": "|S256"}, + } + + 
## add attributes to TIME, LAT/LON, and index variables for variable in variablenames_others: nc_aggregated[variable].attrs = variable_attributes[variable] for variable in parameter_names_all: ancillary_variables_attr = [] ## remove the _FillValue attribute - fill_value = variable_attributes[variable].pop('_FillValue') - encoding.update({variable: {'_FillValue': fill_value}}) + fill_value = variable_attributes[variable].pop("_FillValue") + encoding.update({variable: {"_FillValue": fill_value}}) ## replace nan by FillValue nc_aggregated[variable] = nc_aggregated[variable].fillna(fill_value) nc_aggregated[variable].attrs = variable_attributes[variable] - nc_aggregated[variable].attrs['long_name'] = function_dict[variable] + " " + nc_aggregated[variable].attrs['long_name'] - nc_aggregated[variable].attrs.update({'cell_methods': 'TIME:' + function_dict[variable] + ' (interval: 1 hr comment: time mid point)'}) + nc_aggregated[variable].attrs["long_name"] = ( + function_dict[variable] + " " + nc_aggregated[variable].attrs["long_name"] + ) + nc_aggregated[variable].attrs.update( + { + "cell_methods": "TIME:" + + function_dict[variable] + + " (interval: 1 hr comment: time mid point)" + } + ) ## add percent of QCed values if qc_proportion_all: @@ -516,54 +670,103 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp for stat_method in function_stats: variable_stat_name = variable + "_" + stat_method ancillary_variables_attr += [variable_stat_name] - if stat_method == 'count': - nc_aggregated[variable_stat_name].attrs['units'] = "1" + if stat_method == "count": + nc_aggregated[variable_stat_name].attrs["units"] = "1" else: - nc_aggregated[variable_stat_name].attrs['units'] = variable_attributes[variable]['units'] - - if 'standard_name' in nc_aggregated[variable].attrs: - nc_aggregated[variable_stat_name].attrs['standard_name'] = nc_aggregated[variable].attrs['standard_name'] - nc_aggregated[variable+'_count'].attrs['standard_name'] = nc_aggregated[variable].attrs['standard_name'] + ' number_of_observations' - - nc_aggregated[variable_stat_name].attrs['long_name'] = stat_method + ' data value in the bin, after rejection of flagged data' - nc_aggregated[variable_stat_name].attrs['cell_methods'] = 'TIME:' + stat_method - nc_aggregated[variable_stat_name].attrs['_FillValue'] = fill_value - nc_aggregated[variable_stat_name] = nc_aggregated[variable_stat_name].fillna(fill_value) - nc_aggregated[variable].attrs.update({'ancillary_variables': " ".join(ancillary_variables_attr)}) - + nc_aggregated[variable_stat_name].attrs["units"] = variable_attributes[ + variable + ]["units"] + + if "standard_name" in nc_aggregated[variable].attrs: + nc_aggregated[variable_stat_name].attrs[ + "standard_name" + ] = nc_aggregated[variable].attrs["standard_name"] + nc_aggregated[variable + "_count"].attrs["standard_name"] = ( + nc_aggregated[variable].attrs["standard_name"] + + " number_of_observations" + ) + + nc_aggregated[variable_stat_name].attrs["long_name"] = ( + stat_method + " data value in the bin, after rejection of flagged data" + ) + nc_aggregated[variable_stat_name].attrs["cell_methods"] = ( + "TIME:" + stat_method + ) + nc_aggregated[variable_stat_name].attrs["_FillValue"] = fill_value + nc_aggregated[variable_stat_name] = nc_aggregated[ + variable_stat_name + ].fillna(fill_value) + nc_aggregated[variable].attrs.update( + {"ancillary_variables": " ".join(ancillary_variables_attr)} + ) ## create the output file name and write the aggregated product as netCDF facility_code = 
get_facility_code(os.path.join(input_dir, files_to_aggregate[0])) data_code = "".join(sorted(set(get_data_code(p) for p in parameter_names_all))) if 0 in qcflags: - product_type = 'hourly-timeseries-including-non-QC' + product_type = "hourly-timeseries-including-non-QC" else: - product_type = 'hourly-timeseries' + product_type = "hourly-timeseries" file_version = 2 - ncout_filename = generate_netcdf_output_filename(nc=nc_aggregated, facility_code=facility_code, data_code=data_code, - site_code=site_code, - product_type=product_type, file_version=file_version) + ncout_filename = generate_netcdf_output_filename( + nc=nc_aggregated, + facility_code=facility_code, + data_code=data_code, + site_code=site_code, + product_type=product_type, + file_version=file_version, + ) ncout_path = os.path.join(output_dir, ncout_filename) write_netCDF_aggfile(nc_aggregated, ncout_path, encoding) - return ncout_path, bad_files if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Concatenate ALL variables from ALL instruments from ALL deployments from ONE site at 1hr time bin") - parser.add_argument('-site', dest='site_code', help='site code, like NRMMAI', required=True) - parser.add_argument('-files', dest='filenames', help='name of the file that contains the source URLs', required=True) - parser.add_argument('-qc', dest='qcflags', help='list of QC flags to select variable values to keep', nargs='+', required=True) - parser.add_argument('-indir', dest='input_dir', help='base path of input files', default='', required=False) - parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. Default ./', - default='./', required=False) + parser = argparse.ArgumentParser( + description="Concatenate ALL variables from ALL instruments from ALL deployments from ONE site at 1hr time bin" + ) + parser.add_argument( + "-site", dest="site_code", help="site code, like NRMMAI", required=True + ) + parser.add_argument( + "-files", + dest="filenames", + help="name of the file that contains the source URLs", + required=True, + ) + parser.add_argument( + "-qc", + dest="qcflags", + help="list of QC flags to select variable values to keep", + nargs="+", + required=True, + ) + parser.add_argument( + "-indir", + dest="input_dir", + help="base path of input files", + default="", + required=False, + ) + parser.add_argument( + "-outdir", + dest="output_dir", + help="path where the result file will be written. 
Default ./", + default="./", + required=False, + ) args = parser.parse_args() - with open(args.filenames, 'r') as file: + with open(args.filenames, "r") as file: files_to_aggregate = [i.strip() for i in file.readlines()] qcflags = [int(i) for i in args.qcflags] - hourly_aggregator(files_to_aggregate=files_to_aggregate, site_code=args.site_code, qcflags=qcflags, - input_dir=args.input_dir, output_dir=args.output_dir) + hourly_aggregator( + files_to_aggregate=files_to_aggregate, + site_code=args.site_code, + qcflags=qcflags, + input_dir=args.input_dir, + output_dir=args.output_dir, + ) diff --git a/aodntools/timeseries_products/velocity_aggregated_timeseries.py b/aodntools/timeseries_products/velocity_aggregated_timeseries.py index 2a900cc..4b88e96 100644 --- a/aodntools/timeseries_products/velocity_aggregated_timeseries.py +++ b/aodntools/timeseries_products/velocity_aggregated_timeseries.py @@ -3,7 +3,7 @@ import shutil from copy import deepcopy -from netCDF4 import Dataset, num2date, stringtochar +from netCDF4 import Dataset, num2date, stringtochar import numpy as np import json import argparse @@ -13,10 +13,17 @@ import xarray as xr from aodntools.timeseries_products import aggregated_timeseries as utils -from aodntools.timeseries_products.common import (NoInputFilesError, check_velocity_file, current_utc_timestamp, - TIMESTAMP_FORMAT, DATESTAMP_FORMAT) +from aodntools.timeseries_products.common import ( + NoInputFilesError, + check_velocity_file, + current_utc_timestamp, + TIMESTAMP_FORMAT, + DATESTAMP_FORMAT, +) -TEMPLATE_JSON = resource_filename(__name__, 'velocity_aggregated_timeseries_template.json') +TEMPLATE_JSON = resource_filename( + __name__, "velocity_aggregated_timeseries_template.json" +) def get_number_flatvalues(nc): @@ -25,12 +32,12 @@ def get_number_flatvalues(nc): :param nc: xarray dataset :return: number of values, number of cells above the sensor """ - if 'HEIGHT_ABOVE_SENSOR' in nc.dims: - n_cells = nc.dims['HEIGHT_ABOVE_SENSOR'] - n_flatt_values = nc.dims['TIME'] * n_cells + if "HEIGHT_ABOVE_SENSOR" in nc.dims: + n_cells = nc.dims["HEIGHT_ABOVE_SENSOR"] + n_flatt_values = nc.dims["TIME"] * n_cells else: n_cells = 1 - n_flatt_values = nc.dims['TIME'] + n_flatt_values = nc.dims["TIME"] return n_flatt_values, n_cells @@ -44,11 +51,15 @@ def flat_variable(nc, varname): return nc[varname].values.flatten() - - ## MAIN FUNCTION -def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', - download_url_prefix=None, opendap_url_prefix=None): +def velocity_aggregated( + files_to_agg, + site_code, + input_dir="", + output_dir="./", + download_url_prefix=None, + opendap_url_prefix=None, +): """ Aggregate U, V and W CUR variables from all deployments at one site. the vertical cells are flattened and related to its depth @@ -62,16 +73,16 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', :return: file path of the aggregated product, dict of rejected files: errors """ - varlist = ['UCUR', 'VCUR', 'WCUR', 'DEPTH'] - time_units="days since 1950-01-01 00:00:00 UTC" - time_calendar="gregorian" + varlist = ["UCUR", "VCUR", "WCUR", "DEPTH"] + time_units = "days since 1950-01-01 00:00:00 UTC" + time_calendar = "gregorian" epoch = np.datetime64("1950-01-01T00:00:00") - one_day = np.timedelta64(1, 'D') + one_day = np.timedelta64(1, "D") bad_files = {} # default name for temporary file. 
It will be renamed at the end - _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir) + _, temp_outfile = tempfile.mkstemp(suffix=".nc", dir=output_dir) ## check files and get total number of flattened obs n_obs_total = 0 @@ -95,37 +106,58 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', n_files = len(files_to_agg) ## create ncdf file, dimensions and variables - ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format="NETCDF4_CLASSIC") - OBSERVATION = ds.createDimension('OBSERVATION', size=n_obs_total) - INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files) + ds = Dataset(os.path.join(output_dir, temp_outfile), "w", format="NETCDF4_CLASSIC") + OBSERVATION = ds.createDimension("OBSERVATION", size=n_obs_total) + INSTRUMENT = ds.createDimension("INSTRUMENT", size=n_files) STRING256 = ds.createDimension("strlen", 256) - obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0} - obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0} - obs_byte_template = {'datatype': np.byte, 'zlib': True, 'dimensions': ('OBSERVATION'), 'fill_value': 99} - obs_int_template = {'datatype': np.int16, 'zlib': True, 'dimensions': ('OBSERVATION')} - inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")} - inst_float_template ={'datatype': np.float32, 'dimensions': ('INSTRUMENT')} - inst_double_template ={'datatype': np.float64, 'dimensions': ('INSTRUMENT')} - - UCUR = ds.createVariable(varname='UCUR', **obs_float_template) - VCUR = ds.createVariable(varname='VCUR', **obs_float_template) - WCUR = ds.createVariable(varname='WCUR', **obs_float_template) - DEPTH = ds.createVariable(varname='DEPTH', **obs_float_template) - UCURqc = ds.createVariable(varname='UCUR_quality_control', **obs_byte_template) - VCURqc = ds.createVariable(varname='VCUR_quality_control', **obs_byte_template) - WCURqc = ds.createVariable(varname='WCUR_quality_control', **obs_byte_template) - DEPTHqc = ds.createVariable(varname='DEPTH_quality_control', **obs_byte_template) - TIME = ds.createVariable(varname='TIME', **obs_double_template) - instrument_index = ds.createVariable(varname='instrument_index', **obs_int_template) - - source_file = ds.createVariable(varname='source_file', **inst_S256_template) - instrument_id = ds.createVariable(varname='instrument_id', **inst_S256_template) - LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template) - LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template) - NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template) - SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template) - CELL_INDEX = ds.createVariable(varname='CELL_INDEX', **obs_int_template) + obs_double_template = { + "datatype": np.float64, + "zlib": True, + "dimensions": ("OBSERVATION"), + "fill_value": 99999.0, + } + obs_float_template = { + "datatype": np.float32, + "zlib": True, + "dimensions": ("OBSERVATION"), + "fill_value": 99999.0, + } + obs_byte_template = { + "datatype": np.byte, + "zlib": True, + "dimensions": ("OBSERVATION"), + "fill_value": 99, + } + obs_int_template = { + "datatype": np.int16, + "zlib": True, + "dimensions": ("OBSERVATION"), + } + inst_S256_template = {"datatype": "S1", "dimensions": ("INSTRUMENT", "strlen")} + inst_float_template = {"datatype": np.float32, "dimensions": ("INSTRUMENT")} + inst_double_template = {"datatype": 
np.float64, "dimensions": ("INSTRUMENT")} + + UCUR = ds.createVariable(varname="UCUR", **obs_float_template) + VCUR = ds.createVariable(varname="VCUR", **obs_float_template) + WCUR = ds.createVariable(varname="WCUR", **obs_float_template) + DEPTH = ds.createVariable(varname="DEPTH", **obs_float_template) + UCURqc = ds.createVariable(varname="UCUR_quality_control", **obs_byte_template) + VCURqc = ds.createVariable(varname="VCUR_quality_control", **obs_byte_template) + WCURqc = ds.createVariable(varname="WCUR_quality_control", **obs_byte_template) + DEPTHqc = ds.createVariable(varname="DEPTH_quality_control", **obs_byte_template) + TIME = ds.createVariable(varname="TIME", **obs_double_template) + instrument_index = ds.createVariable(varname="instrument_index", **obs_int_template) + + source_file = ds.createVariable(varname="source_file", **inst_S256_template) + instrument_id = ds.createVariable(varname="instrument_id", **inst_S256_template) + LATITUDE = ds.createVariable(varname="LATITUDE", **inst_double_template) + LONGITUDE = ds.createVariable(varname="LONGITUDE", **inst_double_template) + NOMINAL_DEPTH = ds.createVariable(varname="NOMINAL_DEPTH", **inst_float_template) + SECONDS_TO_MIDDLE = ds.createVariable( + varname="SECONDS_TO_MIDDLE", **inst_float_template + ) + CELL_INDEX = ds.createVariable(varname="CELL_INDEX", **obs_int_template) ## main loop start = 0 @@ -139,22 +171,26 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', n_obs, n_cells = get_number_flatvalues(nc) end = start + n_obs - UCUR[start:end] = flat_variable(nc, 'UCUR') - UCURqc[start:end] = flat_variable(nc, 'UCUR_quality_control') - VCUR[start:end] = flat_variable(nc, 'VCUR') - VCURqc[start:end] = flat_variable(nc, 'VCUR_quality_control') - if 'WCUR' in nc.data_vars: - WCUR[start:end] = flat_variable(nc, 'WCUR') - WCURqc[start:end] = flat_variable(nc, 'WCUR_quality_control') + UCUR[start:end] = flat_variable(nc, "UCUR") + UCURqc[start:end] = flat_variable(nc, "UCUR_quality_control") + VCUR[start:end] = flat_variable(nc, "VCUR") + VCURqc[start:end] = flat_variable(nc, "VCUR_quality_control") + if "WCUR" in nc.data_vars: + WCUR[start:end] = flat_variable(nc, "WCUR") + WCURqc[start:end] = flat_variable(nc, "WCUR_quality_control") else: WCUR[start:end] = np.ma.masked WCURqc[start:end] = np.full(n_obs, 9) ##calculate depth and add CELL_INDEX - if 'HEIGHT_ABOVE_SENSOR' in nc.dims: + if "HEIGHT_ABOVE_SENSOR" in nc.dims: DEPTH[start:end] = (nc.DEPTH - nc.HEIGHT_ABOVE_SENSOR).values.flatten() - DEPTHqc[start:end] = np.repeat(flat_variable(nc, 'DEPTH_quality_control'), n_cells) - CELL_INDEX[start:end] = np.tile(np.arange(n_cells, dtype=np.uint32), n_measurements) + DEPTHqc[start:end] = np.repeat( + flat_variable(nc, "DEPTH_quality_control"), n_cells + ) + CELL_INDEX[start:end] = np.tile( + np.arange(n_cells, dtype=np.uint32), n_measurements + ) else: DEPTH[start:end] = nc.DEPTH.values @@ -162,16 +198,20 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', CELL_INDEX[start:end] = np.full(n_obs, 0, dtype=np.uint32) ## set TIME and instrument index - TIME[start:end] = (np.repeat(flat_variable(nc, 'TIME'), n_cells) - epoch) / one_day + TIME[start:end] = ( + np.repeat(flat_variable(nc, "TIME"), n_cells) - epoch + ) / one_day instrument_index[start:end] = np.repeat(index, n_obs) ## get and store deployment metadata LATITUDE[index] = nc.LATITUDE.values LONGITUDE[index] = nc.LONGITUDE.values NOMINAL_DEPTH[index] = utils.get_nominal_depth(nc) - source_file[index] = 
stringtochar(np.array(file, dtype='S256')) - instrument_id[index] = stringtochar(np.array(utils.get_instrument_id(nc), dtype='S256')) + source_file[index] = stringtochar(np.array(file, dtype="S256")) + instrument_id[index] = stringtochar( + np.array(utils.get_instrument_id(nc), dtype="S256") + ) ## add time offset to the middle of the measuring window, if it exists - if 'seconds_to_middle_of_measurement' in nc.TIME.attrs: + if "seconds_to_middle_of_measurement" in nc.TIME.attrs: SECONDS_TO_MIDDLE[index] = nc.TIME.seconds_to_middle_of_measurement else: SECONDS_TO_MIDDLE[index] = np.nan @@ -180,87 +220,158 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', ## add atributes with open(TEMPLATE_JSON) as json_file: attribute_dictionary = json.load(json_file) - variable_attribute_dictionary = attribute_dictionary['_variables'] - global_attribute_dictionary = attribute_dictionary['_global'] + variable_attribute_dictionary = attribute_dictionary["_variables"] + global_attribute_dictionary = attribute_dictionary["_global"] ## set variable attrs for var in list(ds.variables): ds[var].setncatts(variable_attribute_dictionary[var]) if download_url_prefix or opendap_url_prefix: - ds['source_file'].setncatts(utils.source_file_attributes(download_url_prefix, opendap_url_prefix)) + ds["source_file"].setncatts( + utils.source_file_attributes(download_url_prefix, opendap_url_prefix) + ) ## set global attrs - time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(TIMESTAMP_FORMAT) - time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(TIMESTAMP_FORMAT) - time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(DATESTAMP_FORMAT) - time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(DATESTAMP_FORMAT) + time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime( + TIMESTAMP_FORMAT + ) + time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime( + TIMESTAMP_FORMAT + ) + time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime( + DATESTAMP_FORMAT + ) + time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime( + DATESTAMP_FORMAT + ) add_attribute = { - 'title': ("Long Timeseries Velocity Aggregated product: " + ', '.join(varlist) + " at " + - site_code + " between " + time_start + " and " + time_end), - 'site_code': site_code, - 'time_coverage_start': time_start, - 'time_coverage_end': time_end, - 'geospatial_vertical_min': np.min(ds['DEPTH']), - 'geospatial_vertical_max': np.max(ds['DEPTH']), - 'geospatial_lat_min': np.min(ds['LATITUDE']), - 'geospatial_lat_max': np.max(ds['LATITUDE']), - 'geospatial_lon_min': np.min(ds['LONGITUDE']), - 'geospatial_lon_max': np.max(ds['LONGITUDE']), - 'date_created': current_utc_timestamp(), - 'history': current_utc_timestamp() + ': Aggregated file created.', - 'keywords': ', '.join(varlist + ['AGGREGATED']), - 'rejected_files': "\n".join(bad_files.keys()), - 'generating_code_version': __version__ + "title": ( + "Long Timeseries Velocity Aggregated product: " + + ", ".join(varlist) + + " at " + + site_code + + " between " + + time_start + + " and " + + time_end + ), + "site_code": site_code, + "time_coverage_start": time_start, + "time_coverage_end": time_end, + "geospatial_vertical_min": np.min(ds["DEPTH"]), + "geospatial_vertical_max": np.max(ds["DEPTH"]), + "geospatial_lat_min": np.min(ds["LATITUDE"]), + "geospatial_lat_max": np.max(ds["LATITUDE"]), + 
"geospatial_lon_min": np.min(ds["LONGITUDE"]), + "geospatial_lon_max": np.max(ds["LONGITUDE"]), + "date_created": current_utc_timestamp(), + "history": current_utc_timestamp() + ": Aggregated file created.", + "keywords": ", ".join(varlist + ["AGGREGATED"]), + "rejected_files": "\n".join(bad_files.keys()), + "generating_code_version": __version__, } - add_attribute.update(utils.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)) + add_attribute.update( + utils.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir) + ) ## add version - github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/' - '{v}/aodntools/timeseries_products/{f}'.format(v=__version__, f=os.path.basename(__file__)) - ) - global_attribute_dictionary['lineage'] += github_comment + github_comment = ( + "\nThis file was created using https://github.com/aodn/python-aodntools/blob/" + "{v}/aodntools/timeseries_products/{f}".format( + v=__version__, f=os.path.basename(__file__) + ) + ) + global_attribute_dictionary["lineage"] += github_comment global_attribute_dictionary.update(add_attribute) ds.setncatts(dict(sorted(global_attribute_dictionary.items()))) ds.close() - ## create the output file name and rename the tmp file facility_code = utils.get_facility_code(os.path.join(input_dir, files_to_agg[0])) - data_code = 'VZ' - product_type = 'aggregated-timeseries' + data_code = "VZ" + product_type = "aggregated-timeseries" file_version = 1 - output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code, ('FV0'+str(file_version)), - ("velocity-"+product_type), - ('END-'+ time_end_filename), 'C-' + current_utc_timestamp(DATESTAMP_FORMAT)]) + '.nc' + output_name = ( + "_".join( + [ + "IMOS", + facility_code, + data_code, + time_start_filename, + site_code, + ("FV0" + str(file_version)), + ("velocity-" + product_type), + ("END-" + time_end_filename), + "C-" + current_utc_timestamp(DATESTAMP_FORMAT), + ] + ) + + ".nc" + ) ncout_path = os.path.join(output_dir, output_name) shutil.move(temp_outfile, ncout_path) - return ncout_path, bad_files if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments from ONE site") - parser.add_argument('-site', dest='site_code', help='site code, like NRMMAI', required=True) - parser.add_argument('-files', dest='filenames', help='name of the file that contains the source URLs', required=True) - parser.add_argument('-indir', dest='input_dir', help='base path of input files', default='', required=False) - parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. 
Default ./', - default='./', required=False) - parser.add_argument('-download_url', dest='download_url', help='path to the download_url_prefix', - default='', required=False) - parser.add_argument('-opendap_url', dest='opendap_url', help='path to the opendap_url_prefix', - default='', required=False) + parser = argparse.ArgumentParser( + description="Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments from ONE site" + ) + parser.add_argument( + "-site", dest="site_code", help="site code, like NRMMAI", required=True + ) + parser.add_argument( + "-files", + dest="filenames", + help="name of the file that contains the source URLs", + required=True, + ) + parser.add_argument( + "-indir", + dest="input_dir", + help="base path of input files", + default="", + required=False, + ) + parser.add_argument( + "-outdir", + dest="output_dir", + help="path where the result file will be written. Default ./", + default="./", + required=False, + ) + parser.add_argument( + "-download_url", + dest="download_url", + help="path to the download_url_prefix", + default="", + required=False, + ) + parser.add_argument( + "-opendap_url", + dest="opendap_url", + help="path to the opendap_url_prefix", + default="", + required=False, + ) args = parser.parse_args() with open(args.filenames) as ff: files_to_agg = [line.rstrip() for line in ff] - print(velocity_aggregated(files_to_agg=files_to_agg, site_code=args.site_code, - input_dir=args.input_dir, output_dir=args.output_dir, - download_url_prefix=args.download_url, opendap_url_prefix=args.opendap_url)) + print( + velocity_aggregated( + files_to_agg=files_to_agg, + site_code=args.site_code, + input_dir=args.input_dir, + output_dir=args.output_dir, + download_url_prefix=args.download_url, + opendap_url_prefix=args.opendap_url, + ) + ) diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py index 765196a..2bc5193 100644 --- a/aodntools/timeseries_products/velocity_hourly_timeseries.py +++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py @@ -13,15 +13,20 @@ import aodntools.timeseries_products.aggregated_timeseries as utils from aodntools import __version__ -from aodntools.timeseries_products.common import (NoInputFilesError, check_velocity_file, current_utc_timestamp, - TIMESTAMP_FORMAT, DATESTAMP_FORMAT) - -TEMPLATE_JSON = resource_filename(__name__, 'velocity_hourly_timeseries_template.json') +from aodntools.timeseries_products.common import ( + NoInputFilesError, + check_velocity_file, + current_utc_timestamp, + TIMESTAMP_FORMAT, + DATESTAMP_FORMAT, +) + +TEMPLATE_JSON = resource_filename(__name__, "velocity_hourly_timeseries_template.json") QC_FLAG_MAX = 2 TIME_UNITS = "days since 1950-01-01 00:00:00 UTC" TIME_CALENDAR = "gregorian" TIME_EPOCH = np.datetime64("1950-01-01T00:00:00") -ONE_DAY = np.timedelta64(1, 'D') +ONE_DAY = np.timedelta64(1, "D") def cell_velocity_resample(df, binning_function): @@ -34,7 +39,7 @@ def cell_velocity_resample(df, binning_function): """ df_binned = df.apply(binning_function) binned_vars = [] - for var in ('UCUR', 'VCUR', 'WCUR', 'DEPTH'): + for var in ("UCUR", "VCUR", "WCUR", "DEPTH"): if var in df_binned: x = np.ma.masked_array(df_binned[var], mask=np.isnan(df_binned[var])) else: @@ -43,6 +48,7 @@ def cell_velocity_resample(df, binning_function): return tuple(binned_vars) + def append_resampled_values(nc_cell, ds, slice_start, binning_functions): """ Resample U, V, W current and depth values from a single ADCP cell 
into hourly bins, and @@ -54,36 +60,47 @@ def append_resampled_values(nc_cell, ds, slice_start, binning_functions): :param binning_functions: list of numpy function names for binning :return: end index of the slice """ - df_cell = nc_cell.to_dataframe().reset_index().set_index('TIME') + df_cell = nc_cell.to_dataframe().reset_index().set_index("TIME") # shift the index forward 30min to centre the bins on the hour df_cell.index = df_cell.index + pd.Timedelta(minutes=30) - df_cell_1H = df_cell.resample('1H') + df_cell_1H = df_cell.resample("1H") slice_end = len(df_cell_1H) + slice_start # set binned timestamps - time_slice = (np.fromiter(df_cell_1H.min().index, dtype='M8[ns]') - TIME_EPOCH) / ONE_DAY - ds['TIME'][slice_start:slice_end] = time_slice + time_slice = ( + np.fromiter(df_cell_1H.min().index, dtype="M8[ns]") - TIME_EPOCH + ) / ONE_DAY + ds["TIME"][slice_start:slice_end] = time_slice # take the mean of the variables - ds['UCUR'][slice_start:slice_end], \ - ds['VCUR'][slice_start:slice_end], \ - ds['WCUR'][slice_start:slice_end], \ - ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, 'mean') + ( + ds["UCUR"][slice_start:slice_end], + ds["VCUR"][slice_start:slice_end], + ds["WCUR"][slice_start:slice_end], + ds["DEPTH"][slice_start:slice_end], + ) = cell_velocity_resample(df_cell_1H, "mean") for method in binning_functions: - ds['UCUR_' + method][slice_start:slice_end], \ - ds['VCUR_' + method][slice_start:slice_end], \ - ds['WCUR_' + method][slice_start:slice_end], \ - ds['DEPTH_' + method][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, method) + ( + ds["UCUR_" + method][slice_start:slice_end], + ds["VCUR_" + method][slice_start:slice_end], + ds["WCUR_" + method][slice_start:slice_end], + ds["DEPTH_" + method][slice_start:slice_end], + ) = cell_velocity_resample(df_cell_1H, method) return slice_end - ## MAIN FUNCTION -def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir='./', - download_url_prefix=None, opendap_url_prefix=None): +def velocity_hourly_aggregated( + files_to_agg, + site_code, + input_dir="", + output_dir="./", + download_url_prefix=None, + opendap_url_prefix=None, +): """ Aggregate U, V and W CUR variables from the given files (from the same site) and average into hourly bins. The vertical cells are flattened and the actual depth of each is calculated. @@ -97,20 +114,20 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir :return: file path of the hourly aggregated product, dict of rejected files: errors """ - varlist = ['UCUR', 'VCUR', 'WCUR', 'DEPTH'] - binning_fun = ['max', 'min', 'std', 'count'] + varlist = ["UCUR", "VCUR", "WCUR", "DEPTH"] + binning_fun = ["max", "min", "std", "count"] bad_files = {} chunk_size = 90 ## size in days ## default name for temporary file. 
It will be renamed at the end - _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir) + _, temp_outfile = tempfile.mkstemp(suffix=".nc", dir=output_dir) ## check files and get total number of flattened obs print("CHECKING FILES...") for index, file in enumerate(files_to_agg): - print(index, end=',', flush=True) + print(index, end=",", flush=True) with xr.open_dataset(os.path.join(input_dir, file)) as nc: error_list = check_velocity_file(nc, site_code) if error_list: @@ -128,55 +145,69 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir files_to_agg = utils.sort_files(files_to_agg, input_dir=input_dir) ## create ncdf file, dimensions (unlimited) and variables - ds = Dataset(temp_outfile, 'w', format='NETCDF4_CLASSIC') - OBSERVATION = ds.createDimension('OBSERVATION', size=None) - INSTRUMENT = ds.createDimension('INSTRUMENT', size=len(files_to_agg)) + ds = Dataset(temp_outfile, "w", format="NETCDF4_CLASSIC") + OBSERVATION = ds.createDimension("OBSERVATION", size=None) + INSTRUMENT = ds.createDimension("INSTRUMENT", size=len(files_to_agg)) STRING256 = ds.createDimension("strlen", size=256) - obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0} - obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0} - obs_int_template = {'datatype': np.int16, 'zlib': True, 'dimensions': ('OBSERVATION')} - inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")} - inst_float_template ={'datatype': np.float32, 'dimensions': ('INSTRUMENT')} - inst_double_template ={'datatype': np.float64, 'dimensions': ('INSTRUMENT')} - - UCUR = ds.createVariable(varname='UCUR', **obs_float_template) - UCUR_max = ds.createVariable(varname='UCUR_max', **obs_float_template) - UCUR_min = ds.createVariable(varname='UCUR_min', **obs_float_template) - UCUR_std = ds.createVariable(varname='UCUR_std', **obs_float_template) - UCUR_count = ds.createVariable(varname='UCUR_count', **obs_int_template) - VCUR = ds.createVariable(varname='VCUR', **obs_float_template) - VCUR_max = ds.createVariable(varname='VCUR_max', **obs_float_template) - VCUR_min = ds.createVariable(varname='VCUR_min', **obs_float_template) - VCUR_std = ds.createVariable(varname='VCUR_std', **obs_float_template) - VCUR_count = ds.createVariable(varname='VCUR_count', **obs_int_template) - WCUR = ds.createVariable(varname='WCUR', **obs_float_template) - WCUR_max = ds.createVariable(varname='WCUR_max', **obs_float_template) - WCUR_min = ds.createVariable(varname='WCUR_min', **obs_float_template) - WCUR_std = ds.createVariable(varname='WCUR_std', **obs_float_template) - WCUR_count = ds.createVariable(varname='WCUR_count', **obs_int_template) - - DEPTH = ds.createVariable(varname='DEPTH', **obs_float_template) - DEPTH_max = ds.createVariable(varname='DEPTH_max', **obs_float_template) - DEPTH_min = ds.createVariable(varname='DEPTH_min', **obs_float_template) - DEPTH_std = ds.createVariable(varname='DEPTH_std', **obs_float_template) - DEPTH_count = ds.createVariable(varname='DEPTH_count', **obs_int_template) - - TIME = ds.createVariable(varname='TIME', **obs_double_template) - instrument_index = ds.createVariable(varname='instrument_index', **obs_int_template) - - source_file = ds.createVariable(varname='source_file', **inst_S256_template) - instrument_id = ds.createVariable(varname='instrument_id', **inst_S256_template) - LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template) - 
LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template) - NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template) - SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template) - CELL_INDEX = ds.createVariable(varname='CELL_INDEX', **obs_int_template) - - + obs_double_template = { + "datatype": np.float64, + "zlib": True, + "dimensions": ("OBSERVATION"), + "fill_value": 99999.0, + } + obs_float_template = { + "datatype": np.float32, + "zlib": True, + "dimensions": ("OBSERVATION"), + "fill_value": 99999.0, + } + obs_int_template = { + "datatype": np.int16, + "zlib": True, + "dimensions": ("OBSERVATION"), + } + inst_S256_template = {"datatype": "S1", "dimensions": ("INSTRUMENT", "strlen")} + inst_float_template = {"datatype": np.float32, "dimensions": ("INSTRUMENT")} + inst_double_template = {"datatype": np.float64, "dimensions": ("INSTRUMENT")} + + UCUR = ds.createVariable(varname="UCUR", **obs_float_template) + UCUR_max = ds.createVariable(varname="UCUR_max", **obs_float_template) + UCUR_min = ds.createVariable(varname="UCUR_min", **obs_float_template) + UCUR_std = ds.createVariable(varname="UCUR_std", **obs_float_template) + UCUR_count = ds.createVariable(varname="UCUR_count", **obs_int_template) + VCUR = ds.createVariable(varname="VCUR", **obs_float_template) + VCUR_max = ds.createVariable(varname="VCUR_max", **obs_float_template) + VCUR_min = ds.createVariable(varname="VCUR_min", **obs_float_template) + VCUR_std = ds.createVariable(varname="VCUR_std", **obs_float_template) + VCUR_count = ds.createVariable(varname="VCUR_count", **obs_int_template) + WCUR = ds.createVariable(varname="WCUR", **obs_float_template) + WCUR_max = ds.createVariable(varname="WCUR_max", **obs_float_template) + WCUR_min = ds.createVariable(varname="WCUR_min", **obs_float_template) + WCUR_std = ds.createVariable(varname="WCUR_std", **obs_float_template) + WCUR_count = ds.createVariable(varname="WCUR_count", **obs_int_template) + + DEPTH = ds.createVariable(varname="DEPTH", **obs_float_template) + DEPTH_max = ds.createVariable(varname="DEPTH_max", **obs_float_template) + DEPTH_min = ds.createVariable(varname="DEPTH_min", **obs_float_template) + DEPTH_std = ds.createVariable(varname="DEPTH_std", **obs_float_template) + DEPTH_count = ds.createVariable(varname="DEPTH_count", **obs_int_template) + + TIME = ds.createVariable(varname="TIME", **obs_double_template) + instrument_index = ds.createVariable(varname="instrument_index", **obs_int_template) + + source_file = ds.createVariable(varname="source_file", **inst_S256_template) + instrument_id = ds.createVariable(varname="instrument_id", **inst_S256_template) + LATITUDE = ds.createVariable(varname="LATITUDE", **inst_double_template) + LONGITUDE = ds.createVariable(varname="LONGITUDE", **inst_double_template) + NOMINAL_DEPTH = ds.createVariable(varname="NOMINAL_DEPTH", **inst_float_template) + SECONDS_TO_MIDDLE = ds.createVariable( + varname="SECONDS_TO_MIDDLE", **inst_float_template + ) + CELL_INDEX = ds.createVariable(varname="CELL_INDEX", **obs_int_template) ## main loop - print('PROCESSING...') + print("PROCESSING...") slice_start = 0 for index, file in enumerate(files_to_agg): print(index, end=",", flush=True) @@ -186,23 +217,29 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir with xr.open_dataset(os.path.join(input_dir, file)) as nc: - is_2D = 'HEIGHT_ABOVE_SENSOR' in list(nc.variables) + is_2D = "HEIGHT_ABOVE_SENSOR" in list(nc.variables) varlist_nc = [v for 
v in varlist if v in nc.variables.keys()] ## mask values with QC flag>2 for var in varlist_nc: - nc[var] = nc[var].where(nc[var+'_quality_control'] <= QC_FLAG_MAX) + nc[var] = nc[var].where(nc[var + "_quality_control"] <= QC_FLAG_MAX) ## process in chunks ## in water only - chunk_start = max(np.datetime64(nc.attrs['time_deployment_start']), nc.TIME.data.min()) - chunk_end = min(np.datetime64(nc.attrs['time_deployment_end']), nc.TIME.data.max()) - time_increment = 60*60*24*chunk_size ## secs x mins x hours x days - chunk_increment = np.timedelta64(time_increment, 's') + chunk_start = max( + np.datetime64(nc.attrs["time_deployment_start"]), nc.TIME.data.min() + ) + chunk_end = min( + np.datetime64(nc.attrs["time_deployment_end"]), nc.TIME.data.max() + ) + time_increment = 60 * 60 * 24 * chunk_size ## secs x mins x hours x days + chunk_increment = np.timedelta64(time_increment, "s") chunk_partial = chunk_start + chunk_increment chunk_index = 0 while chunk_start < chunk_partial and chunk_start <= chunk_end: - nc_chunk = nc.where((nc.TIME >= chunk_start) & (nc.TIME < chunk_partial), drop=True) + nc_chunk = nc.where( + (nc.TIME >= chunk_start) & (nc.TIME < chunk_partial), drop=True + ) if is_2D: ## process all cells, one by one heights = nc_chunk.HEIGHT_ABOVE_SENSOR.values @@ -210,30 +247,41 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir ## get cell data, drop HEIGHT_ABOVE_SENSOR dim nc_cell = nc_chunk.sel(HEIGHT_ABOVE_SENSOR=cell_height) ## convert to absolute DEPTH - nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell_height - slice_end = append_resampled_values(nc_cell[varlist_nc], ds, slice_start, binning_fun) - CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32) + nc_cell["DEPTH"] = nc_cell["DEPTH"] - cell_height + slice_end = append_resampled_values( + nc_cell[varlist_nc], ds, slice_start, binning_fun + ) + CELL_INDEX[slice_start:slice_end] = np.full( + slice_end - slice_start, cell_idx, dtype=np.uint32 + ) slice_start = slice_end else: - slice_end = append_resampled_values(nc_chunk[varlist_nc], ds, slice_start, binning_fun) - CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32) + slice_end = append_resampled_values( + nc_chunk[varlist_nc], ds, slice_start, binning_fun + ) + CELL_INDEX[slice_start:slice_end] = np.full( + slice_end - slice_start, 0, dtype=np.uint32 + ) slice_start = slice_end chunk_start = chunk_partial chunk_partial += chunk_increment chunk_index += 1 - ## metadata variables - instrument_index[slice_instrument_start:slice_end] = np.repeat(index, slice_end - slice_instrument_start) + instrument_index[slice_instrument_start:slice_end] = np.repeat( + index, slice_end - slice_instrument_start + ) LATITUDE[index] = nc.LATITUDE.values LONGITUDE[index] = nc.LONGITUDE.values NOMINAL_DEPTH[index] = np.array(utils.get_nominal_depth(nc)) - source_file[index] = stringtochar(np.array(file, dtype='S256')) - instrument_id[index] = stringtochar(np.array(utils.get_instrument_id(nc), dtype='S256')) + source_file[index] = stringtochar(np.array(file, dtype="S256")) + instrument_id[index] = stringtochar( + np.array(utils.get_instrument_id(nc), dtype="S256") + ) ## add time offset to the middle of the measuring window, if it exists - if 'seconds_to_middle_of_measurement' in nc.TIME.attrs: + if "seconds_to_middle_of_measurement" in nc.TIME.attrs: SECONDS_TO_MIDDLE[index] = nc.TIME.seconds_to_middle_of_measurement else: SECONDS_TO_MIDDLE[index] = np.nan @@ -242,95 +290,162 @@ def 
velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir ## add atributes with open(TEMPLATE_JSON) as json_file: attribute_dictionary = json.load(json_file) - variable_attribute_dictionary = attribute_dictionary['_variables'] - global_attribute_dictionary = attribute_dictionary['_global'] + variable_attribute_dictionary = attribute_dictionary["_variables"] + global_attribute_dictionary = attribute_dictionary["_global"] ## set variable attrs for var in list(ds.variables): ds[var].setncatts(variable_attribute_dictionary[var]) if download_url_prefix or opendap_url_prefix: - ds['source_file'].setncatts(utils.source_file_attributes(download_url_prefix, opendap_url_prefix)) + ds["source_file"].setncatts( + utils.source_file_attributes(download_url_prefix, opendap_url_prefix) + ) ## set global attrs - time_start = num2date(np.min(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(TIMESTAMP_FORMAT) - time_end = num2date(np.max(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(TIMESTAMP_FORMAT) - time_start_filename = num2date(np.min(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(DATESTAMP_FORMAT) - time_end_filename = num2date(np.max(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(DATESTAMP_FORMAT) - + time_start = num2date(np.min(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime( + TIMESTAMP_FORMAT + ) + time_end = num2date(np.max(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime( + TIMESTAMP_FORMAT + ) + time_start_filename = num2date(np.min(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime( + DATESTAMP_FORMAT + ) + time_end_filename = num2date(np.max(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime( + DATESTAMP_FORMAT + ) add_attribute = { - 'title': ("Long Timeseries Velocity Hourly Aggregated product: " + ', '.join(varlist) + " at " + - site_code + " between " + time_start + " and " + time_end), - 'site_code': site_code, - 'time_coverage_start': time_start, - 'time_coverage_end': time_end, - 'geospatial_vertical_min': np.float32(np.nanmin(ds['DEPTH'])), - 'geospatial_vertical_max': np.float32(np.nanmax(ds['DEPTH'])), - 'geospatial_lat_min': np.float64(np.min(ds['LATITUDE'])), - 'geospatial_lat_max': np.float64(np.max(ds['LATITUDE'])), - 'geospatial_lon_min': np.float64(np.min(ds['LONGITUDE'])), - 'geospatial_lon_max': np.float64(np.max(ds['LONGITUDE'])), - 'date_created': current_utc_timestamp(), - 'history': current_utc_timestamp() + ': Aggregated file created.', - 'keywords': ', '.join(varlist + ['AGGREGATED']), - 'rejected_files': "\n".join(bad_files.keys()), - 'generating_code_version': __version__ + "title": ( + "Long Timeseries Velocity Hourly Aggregated product: " + + ", ".join(varlist) + + " at " + + site_code + + " between " + + time_start + + " and " + + time_end + ), + "site_code": site_code, + "time_coverage_start": time_start, + "time_coverage_end": time_end, + "geospatial_vertical_min": np.float32(np.nanmin(ds["DEPTH"])), + "geospatial_vertical_max": np.float32(np.nanmax(ds["DEPTH"])), + "geospatial_lat_min": np.float64(np.min(ds["LATITUDE"])), + "geospatial_lat_max": np.float64(np.max(ds["LATITUDE"])), + "geospatial_lon_min": np.float64(np.min(ds["LONGITUDE"])), + "geospatial_lon_max": np.float64(np.max(ds["LONGITUDE"])), + "date_created": current_utc_timestamp(), + "history": current_utc_timestamp() + ": Aggregated file created.", + "keywords": ", ".join(varlist + ["AGGREGATED"]), + "rejected_files": "\n".join(bad_files.keys()), + "generating_code_version": __version__, } - add_attribute.update(utils.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)) + 
add_attribute.update( + utils.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir) + ) ## add version - github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/' - '{v}/aodntools/timeseries_products/{f}'.format(v=__version__, f=os.path.basename(__file__)) - ) - global_attribute_dictionary['lineage'] += github_comment + github_comment = ( + "\nThis file was created using https://github.com/aodn/python-aodntools/blob/" + "{v}/aodntools/timeseries_products/{f}".format( + v=__version__, f=os.path.basename(__file__) + ) + ) + global_attribute_dictionary["lineage"] += github_comment global_attribute_dictionary.update(add_attribute) ds.setncatts(dict(sorted(global_attribute_dictionary.items()))) - ## NOTE: There is a possibility of having NaNs in DEPTH after the binning ## this is the warning when calculating the min/max DEPTH ## maybe I should clean the dataset before close it ds.close() - - ## create the output file name and rename the tmp file facility_code = utils.get_facility_code(os.path.join(input_dir, files_to_agg[0])) - data_code = 'VZ' - product_type = 'hourly-timeseries' + data_code = "VZ" + product_type = "hourly-timeseries" file_version = 2 - output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code, ('FV0'+str(file_version)), - ("velocity-"+product_type), - ('END-'+ time_end_filename), 'C-' + current_utc_timestamp(DATESTAMP_FORMAT)]) + '.nc' + output_name = ( + "_".join( + [ + "IMOS", + facility_code, + data_code, + time_start_filename, + site_code, + ("FV0" + str(file_version)), + ("velocity-" + product_type), + ("END-" + time_end_filename), + "C-" + current_utc_timestamp(DATESTAMP_FORMAT), + ] + ) + + ".nc" + ) ncout_path = os.path.join(output_dir, output_name) shutil.move(temp_outfile, ncout_path) - return ncout_path, bad_files if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments from ONE site") - parser.add_argument('-site', dest='site_code', help='site code, like NRMMAI', required=True) - parser.add_argument('-files', dest='filenames', help='name of the file that contains the source URLs', required=True) - parser.add_argument('-indir', dest='input_dir', help='base path of input files', default='', required=False) - parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. Default ./', - default='./', required=False) - parser.add_argument('-download_url', dest='download_url', help='path to the download_url_prefix', - default='', required=False) - parser.add_argument('-opendap_url', dest='opendap_url', help='path to the opendap_url_prefix', - default='', required=False) + parser = argparse.ArgumentParser( + description="Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments from ONE site" + ) + parser.add_argument( + "-site", dest="site_code", help="site code, like NRMMAI", required=True + ) + parser.add_argument( + "-files", + dest="filenames", + help="name of the file that contains the source URLs", + required=True, + ) + parser.add_argument( + "-indir", + dest="input_dir", + help="base path of input files", + default="", + required=False, + ) + parser.add_argument( + "-outdir", + dest="output_dir", + help="path where the result file will be written. 
Default ./", + default="./", + required=False, + ) + parser.add_argument( + "-download_url", + dest="download_url", + help="path to the download_url_prefix", + default="", + required=False, + ) + parser.add_argument( + "-opendap_url", + dest="opendap_url", + help="path to the opendap_url_prefix", + default="", + required=False, + ) args = parser.parse_args() with open(args.filenames) as ff: files_to_agg = [line.rstrip() for line in ff] - - print(velocity_hourly_aggregated(files_to_agg=files_to_agg, site_code=args.site_code, - input_dir=args.input_dir, output_dir=args.output_dir, - download_url_prefix=args.download_url, opendap_url_prefix=args.opendap_url)) + print( + velocity_hourly_aggregated( + files_to_agg=files_to_agg, + site_code=args.site_code, + input_dir=args.input_dir, + output_dir=args.output_dir, + download_url_prefix=args.download_url, + opendap_url_prefix=args.opendap_url, + ) + ) diff --git a/examples/rottnest.py b/examples/rottnest.py index 41d6e0d..1e8b15e 100644 --- a/examples/rottnest.py +++ b/examples/rottnest.py @@ -14,37 +14,42 @@ from aodntools.ncwriter import ImosTemplate, TIMESTAMP_FORMAT EXAMPLES_PATH = os.path.dirname(__file__) -TEMPLATE_JSON = os.path.join(EXAMPLES_PATH, 'rottnest.json') -DATA_CSV = os.path.join(EXAMPLES_PATH, 'rottnest.csv') +TEMPLATE_JSON = os.path.join(EXAMPLES_PATH, "rottnest.json") +DATA_CSV = os.path.join(EXAMPLES_PATH, "rottnest.csv") # read data from CSV -df = pd.read_csv(DATA_CSV, parse_dates=['TIME']) +df = pd.read_csv(DATA_CSV, parse_dates=["TIME"]) # create template template = ImosTemplate.from_json(TEMPLATE_JSON) # update attributes -for att in ('site_code', 'platform_code', 'deployment_code', 'instrument_nominal_depth'): +for att in ( + "site_code", + "platform_code", + "deployment_code", + "instrument_nominal_depth", +): template.global_attributes[att] = df[att].unique()[0] # add data -t_data = df['TIME'].dt.to_pydatetime() -t_template = template.variables['TIME'] -t_template['_data'] = date2num(t_data, t_template['units'], t_template['calendar']) +t_data = df["TIME"].dt.to_pydatetime() +t_template = template.variables["TIME"] +t_template["_data"] = date2num(t_data, t_template["units"], t_template["calendar"]) -template.variables['LATITUDE']['_data'] = df['LATITUDE'].unique()[0] -template.variables['LONGITUDE']['_data'] = df['LONGITUDE'].unique()[0] +template.variables["LATITUDE"]["_data"] = df["LATITUDE"].unique()[0] +template.variables["LONGITUDE"]["_data"] = df["LONGITUDE"].unique()[0] for name, var in template.variables.items(): - if '_data' not in var: - var['_data'] = df[name].values + if "_data" not in var: + var["_data"] = df[name].values # convert valid_min/max attributes to match variable type # TODO: make this a template method for name, var in template.variables.items(): - var_type = var['_datatype'] - for attr in ('valid_min', 'valid_max'): + var_type = var["_datatype"] + for attr in ("valid_min", "valid_max"): if attr in var: var[attr] = np.cast[var_type](var[attr]) @@ -53,10 +58,12 @@ # add creation date template.add_date_created_attribute() -template.global_attributes['history'] = "{}: File created".format(template.date_created.strftime(TIMESTAMP_FORMAT)) +template.global_attributes["history"] = "{}: File created".format( + template.date_created.strftime(TIMESTAMP_FORMAT) +) # generate file name -outfile = 'rottnest.nc' +outfile = "rottnest.nc" # write file template.to_netcdf(outfile) diff --git a/setup.py b/setup.py index 4a2ab09..5519c14 100644 --- a/setup.py +++ b/setup.py @@ -1,56 +1,54 @@ from setuptools import 
setup, find_packages INSTALL_REQUIRES = [ - 'jsonschema>=2.6.0,<3.0.0', - 'numpy>=1.13.0', - 'netCDF4>=1.5.3', - 'pandas>=0.24.2', - 'xarray>=0.11.3' + "jsonschema>=2.6.0,<3.0.0", + "numpy>=1.13.0", + "netCDF4>=1.5.3", + "pandas>=0.24.2", + "xarray>=0.11.3", ] TESTS_REQUIRE = [ - 'pytest', + "pytest", # Need to manually install Black while we support Python 3.5 # 'black' ] -EXTRAS_REQUIRE = { - 'testing': TESTS_REQUIRE -} +EXTRAS_REQUIRE = {"testing": TESTS_REQUIRE} PACKAGE_DATA = { - 'aodntools.ncwriter': ['*.json'], - 'aodntools.timeseries_products': ['*.json'] + "aodntools.ncwriter": ["*.json"], + "aodntools.timeseries_products": ["*.json"], } -PACKAGE_EXCLUDES = ['test_aodntools.*', 'test_aodntools'] -PACKAGE_NAME = 'aodntools' +PACKAGE_EXCLUDES = ["test_aodntools.*", "test_aodntools"] +PACKAGE_NAME = "aodntools" setup( name=PACKAGE_NAME, - version='0.0.0', + version="0.0.0", packages=find_packages(exclude=PACKAGE_EXCLUDES), package_data=PACKAGE_DATA, - url='https://github.com/aodn', - license='GPLv3', - author='AODN', - author_email='projectofficers@emii.org.au', - description='AODN data tools library', + url="https://github.com/aodn", + license="GPLv3", + author="AODN", + author_email="projectofficers@emii.org.au", + description="AODN data tools library", zip_safe=False, - python_requires='>=3.5', + python_requires=">=3.5", install_requires=INSTALL_REQUIRES, tests_require=TESTS_REQUIRE, extras_require=EXTRAS_REQUIRE, - test_suite='test_aodntools', + test_suite="test_aodntools", classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: Implementation :: CPython', - ] + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Natural Language :: English", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: Implementation :: CPython", + ], ) diff --git a/test_aodntools/base_test.py b/test_aodntools/base_test.py index 2d51428..e98c8ae 100644 --- a/test_aodntools/base_test.py +++ b/test_aodntools/base_test.py @@ -11,51 +11,67 @@ class BaseTestCase(unittest.TestCase): @property def temp_dir(self): - if not hasattr(self, '_temp_dir'): + if not hasattr(self, "_temp_dir"): self._temp_dir = tempfile.mkdtemp(prefix=self.__class__.__name__) return self._temp_dir @property def temp_nc_file(self): - if not hasattr(self, '_temp_nc_file'): - with tempfile.NamedTemporaryFile(suffix='.nc', prefix=self.__class__.__name__, dir=self.temp_dir) as f: + if not hasattr(self, "_temp_nc_file"): + with tempfile.NamedTemporaryFile( + suffix=".nc", prefix=self.__class__.__name__, dir=self.temp_dir + ) as f: pass self._temp_nc_file = f.name return self._temp_nc_file def tearDown(self): - if hasattr(self, '_temp_dir'): + if hasattr(self, "_temp_dir"): shutil.rmtree(self._temp_dir) - def compare_global_attributes(self, dataset, - attrs = ('geospatial_lat_max', 'geospatial_lat_min', - 'geospatial_lon_max', 'geospatial_lon_min', - 'geospatial_vertical_max', 'geospatial_vertical_min', - 'time_coverage_start', 'time_coverage_end' - ) - ): 
+ def compare_global_attributes( + self, + dataset, + attrs=( + "geospatial_lat_max", + "geospatial_lat_min", + "geospatial_lon_max", + "geospatial_lon_min", + "geospatial_vertical_max", + "geospatial_vertical_min", + "time_coverage_start", + "time_coverage_end", + ), + ): "Compare global attributes of the given dataset with those in self.EXPECTED_OUTPUT_FILE" not_matching = [] with Dataset(self.EXPECTED_OUTPUT_FILE) as expected: for attr in attrs: if dataset.getncattr(attr) != expected.getncattr(attr): - not_matching.append((attr, - "expected: {exp}; found: {found}".format(exp=dataset.getncattr(attr), - found=dataset.getncattr(attr)) - )) + not_matching.append( + ( + attr, + "expected: {exp}; found: {found}".format( + exp=dataset.getncattr(attr), + found=dataset.getncattr(attr), + ), + ) + ) self.assertEqual([], not_matching) def check_nan_values(self, dataset): "check that there are no NaN values in any variable (they should be fill values instead)" - nan_vars = [(name, "contains NaN values") - for name, var in dataset.variables.items() - if var.dtype in (np.dtype('float32'), np.dtype('float64')) and any(np.isnan(var[:])) - ] + nan_vars = [ + (name, "contains NaN values") + for name, var in dataset.variables.items() + if var.dtype in (np.dtype("float32"), np.dtype("float64")) + and any(np.isnan(var[:])) + ] self.assertEqual([], nan_vars) - def compare_variables(self, dataset, skip_vars=('source_file', 'instrument_id')): + def compare_variables(self, dataset, skip_vars=("source_file", "instrument_id")): """Compare dimensions and values of all variables in dataset with those in self.EXPECTED_OUTPUT_FILE, except for variables listed in skip_vars. """ diff --git a/test_aodntools/ncwriter/test_imos_template.py b/test_aodntools/ncwriter/test_imos_template.py index bf85219..760fdc1 100644 --- a/test_aodntools/ncwriter/test_imos_template.py +++ b/test_aodntools/ncwriter/test_imos_template.py @@ -14,12 +14,11 @@ "project": "Integrated Marine Observing System (IMOS)", "naming_authority": "IMOS", "data_centre": "Australian Ocean Data Network (AODN)", - "data_centre_email": "info@aodn.org.au" + "data_centre_email": "info@aodn.org.au", } class TestImosTemplate(TemplateTestCase): - def setUp(self): super(TestImosTemplate, self).setUp() self.template = ImosTemplate() @@ -32,9 +31,7 @@ def test_fixed_global_attributes(self): self.assertEqual(value, self.template.global_attributes[name]) def test_combine_global_attributes(self): - my_globals = {"title": "This is a test", - "project": "Test project" - } + my_globals = {"title": "This is a test", "project": "Test project"} expected = TEST_FIXED_GLOBALS.copy() expected.update(my_globals) template = ImosTemplate(global_attributes=my_globals) @@ -49,55 +46,71 @@ def test_date_created(self): def test_add_date_created_attribute(self): self.template.add_date_created_attribute() - parsed_date_created = datetime.strptime(self.template.global_attributes['date_created'], '%Y-%m-%dT%H:%M:%SZ') - self.assertTrue(self.template.date_created - parsed_date_created < timedelta(seconds=1)) + parsed_date_created = datetime.strptime( + self.template.global_attributes["date_created"], "%Y-%m-%dT%H:%M:%SZ" + ) + self.assertTrue( + self.template.date_created - parsed_date_created < timedelta(seconds=1) + ) def test_add_extent_attributes(self): - self.template.add_extent_attributes(time_var=None, vert_var=None, lat_var=None, lon_var=None) - for att in ('time_coverage_start', 'time_coverage_end', 'geospatial_vertical_min', 'geospatial_vertical_max', - 'geospatial_lat_min', 
'geospatial_lat_max', 'geospatial_lon_min', 'geospatial_lon_max'): - self.assertEqual('', self.template.global_attributes[att]) + self.template.add_extent_attributes( + time_var=None, vert_var=None, lat_var=None, lon_var=None + ) + for att in ( + "time_coverage_start", + "time_coverage_end", + "geospatial_vertical_min", + "geospatial_vertical_max", + "geospatial_lat_min", + "geospatial_lat_max", + "geospatial_lon_min", + "geospatial_lon_max", + ): + self.assertEqual("", self.template.global_attributes[att]) self.template.variables = { - 'TIME': { - '_dimensions': ['TIME'], - '_datatype': 'float64', - '_data': np.array([0, 1, 2, np.nan, 25261.375]), - 'units': 'days since 1950-01-01 00:00:00 UTC' + "TIME": { + "_dimensions": ["TIME"], + "_datatype": "float64", + "_data": np.array([0, 1, 2, np.nan, 25261.375]), + "units": "days since 1950-01-01 00:00:00 UTC", }, - 'LATITUDE': { - '_dimensions': ['TIME'], - '_datatype': 'float32', - '_FillValue': -999., - '_data': np.array([-999., -999., -42, -43, 12]) + "LATITUDE": { + "_dimensions": ["TIME"], + "_datatype": "float32", + "_FillValue": -999.0, + "_data": np.array([-999.0, -999.0, -42, -43, 12]), }, - 'LONGITUDE': { - '_dimensions': ['TIME'], - '_datatype': 'float32', - '_data': np.arange(10) + "LONGITUDE": { + "_dimensions": ["TIME"], + "_datatype": "float32", + "_data": np.arange(10), }, - 'NOMINAL_DEPTH': { - '_datatype': 'float32', - '_data': 20 + "NOMINAL_DEPTH": {"_datatype": "float32", "_data": 20}, + "DEPTH": { + "_dimensions": ["TIME"], + "_datatype": "float32", + "_data": np.repeat(np.nan, 5), }, - 'DEPTH': { - '_dimensions': ['TIME'], - '_datatype': 'float32', - '_data': np.repeat(np.nan, 5) - } } - self.template.add_extent_attributes(vert_var='NOMINAL_DEPTH') - self.assertEqual('1950-01-01T00:00:00Z', self.template.global_attributes['time_coverage_start']) - self.assertEqual('2019-03-01T09:00:00Z', self.template.global_attributes['time_coverage_end']) - self.assertEqual(-43, self.template.global_attributes['geospatial_lat_min']) - self.assertEqual(12, self.template.global_attributes['geospatial_lat_max']) - self.assertEqual(0, self.template.global_attributes['geospatial_lon_min']) - self.assertEqual(9, self.template.global_attributes['geospatial_lon_max']) - self.assertEqual(20, self.template.global_attributes['geospatial_vertical_min']) - self.assertEqual(20, self.template.global_attributes['geospatial_vertical_max']) + self.template.add_extent_attributes(vert_var="NOMINAL_DEPTH") + self.assertEqual( + "1950-01-01T00:00:00Z", + self.template.global_attributes["time_coverage_start"], + ) + self.assertEqual( + "2019-03-01T09:00:00Z", self.template.global_attributes["time_coverage_end"] + ) + self.assertEqual(-43, self.template.global_attributes["geospatial_lat_min"]) + self.assertEqual(12, self.template.global_attributes["geospatial_lat_max"]) + self.assertEqual(0, self.template.global_attributes["geospatial_lon_min"]) + self.assertEqual(9, self.template.global_attributes["geospatial_lon_max"]) + self.assertEqual(20, self.template.global_attributes["geospatial_vertical_min"]) + self.assertEqual(20, self.template.global_attributes["geospatial_vertical_max"]) self.assertRaises(ValueError, self.template.add_extent_attributes) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/ncwriter/test_schema.py b/test_aodntools/ncwriter/test_schema.py index 84c12a4..aa77aaf 100644 --- a/test_aodntools/ncwriter/test_schema.py +++ b/test_aodntools/ncwriter/test_schema.py @@ -4,105 +4,116 @@ import 
numpy as np -from aodntools.ncwriter.schema import (validate_template, validate_dimensions, validate_variables, - validate_global_attributes, ValidationError) +from aodntools.ncwriter.schema import ( + validate_template, + validate_dimensions, + validate_variables, + validate_global_attributes, + ValidationError, +) class TestSchema(unittest.TestCase): def test_validate_template(self): validate_template({}) - validate_template({'_dimensions': {'X': 1}}) - validate_template({'_variables': {'X': {'name': 'X'}}}) - validate_template({'title': 'test'}) + validate_template({"_dimensions": {"X": 1}}) + validate_template({"_variables": {"X": {"name": "X"}}}) + validate_template({"title": "test"}) - self.assertRaises(ValidationError, validate_template, {'_bad': 1}) - self.assertRaises(ValidationError, validate_template, {'_dimensions': None}) - self.assertRaises(ValidationError, validate_template, {'_variables': 1}) - self.assertRaises(ValidationError, validate_template, {'_variables': {'X': {'_dimensions': 1}}}) + self.assertRaises(ValidationError, validate_template, {"_bad": 1}) + self.assertRaises(ValidationError, validate_template, {"_dimensions": None}) + self.assertRaises(ValidationError, validate_template, {"_variables": 1}) + self.assertRaises( + ValidationError, + validate_template, + {"_variables": {"X": {"_dimensions": 1}}}, + ) def test_validate_dimensions(self): validate_dimensions({}) - validate_dimensions({'X': 1, 'Y': None}) + validate_dimensions({"X": 1, "Y": None}) with self.assertRaises(ValidationError): validate_dimensions(None) with self.assertRaises(ValidationError): - validate_dimensions('X') + validate_dimensions("X") with self.assertRaises(ValidationError): - validate_dimensions(['X']) + validate_dimensions(["X"]) with self.assertRaises(ValidationError): validate_dimensions(10) with self.assertRaises(ValidationError): - validate_dimensions({'123': 123}) + validate_dimensions({"123": 123}) with self.assertRaises(ValidationError): - validate_dimensions({'X': 'one'}) + validate_dimensions({"X": "one"}) with self.assertRaises(ValidationError): - validate_dimensions({'X': -1}) + validate_dimensions({"X": -1}) with self.assertRaises(ValidationError): - validate_dimensions({'X': 1.5}) + validate_dimensions({"X": 1.5}) def test_validate_variables(self): validate_variables({}) - validate_variables({'X': {}}) - validate_variables({'X': {'_datatype': 'float32'}}) - validate_variables({'X': {'_datatype': np.float32}}) - validate_variables({'X': {'_datatype': np.dtype('float32')}}) - validate_variables({'X': {'_dimensions': []}}) - validate_variables({'X': {'name': 'X'}}) - validate_variables({'X': {'_data': None}}) - validate_variables({'X': {'_data': 100}}) - validate_variables({'X': {'_data': np.array([1, 2])}}) - validate_variables({'X': {'_dimensions': ['X'], '_datatype': 'float32'}}) - validate_variables({ - 'X': { - '_dimensions': ['X'], - '_datatype': 'float32', - '_data': [42], - 'name': 'X', - 'count': 1 - }, - 'Y': {'_datatype': 'float64'} - }) + validate_variables({"X": {}}) + validate_variables({"X": {"_datatype": "float32"}}) + validate_variables({"X": {"_datatype": np.float32}}) + validate_variables({"X": {"_datatype": np.dtype("float32")}}) + validate_variables({"X": {"_dimensions": []}}) + validate_variables({"X": {"name": "X"}}) + validate_variables({"X": {"_data": None}}) + validate_variables({"X": {"_data": 100}}) + validate_variables({"X": {"_data": np.array([1, 2])}}) + validate_variables({"X": {"_dimensions": ["X"], "_datatype": "float32"}}) + validate_variables( 
+ { + "X": { + "_dimensions": ["X"], + "_datatype": "float32", + "_data": [42], + "name": "X", + "count": 1, + }, + "Y": {"_datatype": "float64"}, + } + ) with self.assertRaises(ValidationError): validate_variables(None) with self.assertRaises(ValidationError): - validate_variables('VAR') + validate_variables("VAR") with self.assertRaises(ValidationError): - validate_variables({'__X': {}}) + validate_variables({"__X": {}}) with self.assertRaises(ValidationError): - validate_variables({'X': {'_unknown': 'else'}}) + validate_variables({"X": {"_unknown": "else"}}) with self.assertRaises(ValidationError): - validate_variables({'X': {'_datatype': 'no_such_type'}}) + validate_variables({"X": {"_datatype": "no_such_type"}}) with self.assertRaises(ValidationError): - validate_variables({'X': {'_datatype': 42}}) + validate_variables({"X": {"_datatype": 42}}) with self.assertRaises(ValidationError): - validate_variables({'X': {'_datatype': 'float32', '0': 'none'}}) + validate_variables({"X": {"_datatype": "float32", "0": "none"}}) def test_validate_attributes(self): validate_global_attributes({}) - validate_global_attributes({'name': 'test'}) - validate_global_attributes({'name': 1.5}) - validate_global_attributes({'name': [1, 2, 3]}) + validate_global_attributes({"name": "test"}) + validate_global_attributes({"name": 1.5}) + validate_global_attributes({"name": [1, 2, 3]}) with self.assertRaises(ValidationError): validate_global_attributes(None) with self.assertRaises(ValidationError): - validate_global_attributes('X') + validate_global_attributes("X") with self.assertRaises(ValidationError): validate_global_attributes([]) with self.assertRaises(ValidationError): - validate_global_attributes({'_dimensions': {}}) + validate_global_attributes({"_dimensions": {}}) with self.assertRaises(ValidationError): - validate_global_attributes({'_badname': 1}) + validate_global_attributes({"_badname": 1}) with self.assertRaises(ValidationError): - validate_global_attributes({'null': None}) + validate_global_attributes({"null": None}) with self.assertRaises(ValidationError): - validate_global_attributes({'bool': True}) + validate_global_attributes({"bool": True}) with self.assertRaises(ValidationError): - validate_global_attributes({'object': {}}) + validate_global_attributes({"object": {}}) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/ncwriter/test_template.py b/test_aodntools/ncwriter/test_template.py index 3a878be..6393f52 100644 --- a/test_aodntools/ncwriter/test_template.py +++ b/test_aodntools/ncwriter/test_template.py @@ -12,60 +12,72 @@ from netCDF4 import Dataset from test_aodntools.base_test import BaseTestCase -from aodntools.ncwriter import DatasetTemplate, ValidationError, metadata_attributes, special_attributes +from aodntools.ncwriter import ( + DatasetTemplate, + ValidationError, + metadata_attributes, + special_attributes, +) TEST_ROOT = os.path.dirname(__file__) -TEMPLATE_JSON = os.path.join(TEST_ROOT, 'template1.json') -TEMPLATE_PARTIAL_JSON = os.path.join(TEST_ROOT, 'template_partial.json') -BAD_JSON = os.path.join(TEST_ROOT, 'bad.json') +TEMPLATE_JSON = os.path.join(TEST_ROOT, "template1.json") +TEMPLATE_PARTIAL_JSON = os.path.join(TEST_ROOT, "template_partial.json") +BAD_JSON = os.path.join(TEST_ROOT, "bad.json") class TestUtils(BaseTestCase): def test_metadata_attributes(self): self.assertEqual({}, metadata_attributes({})) - self.assertEqual({}, metadata_attributes({'_dimensions': {}, '_fill_value': -999})) - self.assertEqual({'title': 
'Title'}, - metadata_attributes({'title': 'Title'}) - ) - self.assertEqual({'title': 'Title'}, - metadata_attributes({'title': 'Title', '_fill_value': -999}) - ) + self.assertEqual( + {}, metadata_attributes({"_dimensions": {}, "_fill_value": -999}) + ) + self.assertEqual({"title": "Title"}, metadata_attributes({"title": "Title"})) + self.assertEqual( + {"title": "Title"}, + metadata_attributes({"title": "Title", "_fill_value": -999}), + ) self.assertIsInstance(metadata_attributes(OrderedDict()), OrderedDict) def test_special_attributes(self): self.assertEqual({}, special_attributes({})) - self.assertEqual({}, special_attributes({'title': 'Title'})) - self.assertEqual({'dimensions': {}, 'fill_value': -999}, - special_attributes({'_dimensions': {}, '_fill_value': -999})) - self.assertEqual({'fill_value': -999}, - special_attributes({'title': 'Title', '_fill_value': -999})) + self.assertEqual({}, special_attributes({"title": "Title"})) + self.assertEqual( + {"dimensions": {}, "fill_value": -999}, + special_attributes({"_dimensions": {}, "_fill_value": -999}), + ) + self.assertEqual( + {"fill_value": -999}, + special_attributes({"title": "Title", "_fill_value": -999}), + ) class TemplateTestCase(unittest.TestCase): with open(TEMPLATE_JSON) as t: template_dict = json.load(t, object_pairs_hook=OrderedDict) - dimensions = template_dict['_dimensions'] - variables = template_dict['_variables'] + dimensions = template_dict["_dimensions"] + variables = template_dict["_variables"] global_attributes = metadata_attributes(template_dict) values1 = np.array([1], dtype=np.float32) values10 = np.arange(10, dtype=np.float32) @property def temp_dir(self): - if not hasattr(self, '_temp_dir'): + if not hasattr(self, "_temp_dir"): self._temp_dir = tempfile.mkdtemp(prefix=self.__class__.__name__) return self._temp_dir @property def temp_nc_file(self): - if not hasattr(self, '_temp_nc_file'): - with tempfile.NamedTemporaryFile(suffix='.nc', prefix=self.__class__.__name__, dir=self.temp_dir) as f: + if not hasattr(self, "_temp_nc_file"): + with tempfile.NamedTemporaryFile( + suffix=".nc", prefix=self.__class__.__name__, dir=self.temp_dir + ) as f: pass self._temp_nc_file = f.name return self._temp_nc_file def tearDown(self): - if hasattr(self, '_temp_dir'): + if hasattr(self, "_temp_dir"): shutil.rmtree(self._temp_dir) @@ -77,32 +89,36 @@ def test_init_empty(self): self.assertEqual({}, template.global_attributes) def test_init_from_dicts(self): - template = DatasetTemplate(dimensions=self.dimensions, - variables=self.variables, - global_attributes=self.global_attributes) + template = DatasetTemplate( + dimensions=self.dimensions, + variables=self.variables, + global_attributes=self.global_attributes, + ) self.assertEqual(self.dimensions, template.dimensions) self.assertEqual(self.variables, template.variables) self.assertEqual(self.global_attributes, template.global_attributes) def test_init_from_dicts_validation(self): with self.assertRaises(ValidationError): - DatasetTemplate(dimensions='X') + DatasetTemplate(dimensions="X") with self.assertRaises(ValidationError): - DatasetTemplate(dimensions={'TIME': -1}) + DatasetTemplate(dimensions={"TIME": -1}) with self.assertRaises(ValidationError): - DatasetTemplate(variables='TEMP') + DatasetTemplate(variables="TEMP") with self.assertRaises(ValidationError): - DatasetTemplate(variables={'_TEMP': {}}) + DatasetTemplate(variables={"_TEMP": {}}) with self.assertRaises(ValidationError): - DatasetTemplate(global_attributes='title') + 
DatasetTemplate(global_attributes="title") with self.assertRaises(ValidationError): - DatasetTemplate(global_attributes={'title': None}) + DatasetTemplate(global_attributes={"title": None}) def test_invalid_json(self): error_pattern = r"invalid JSON file '{}'".format(re.escape(BAD_JSON)) - self.assertRaisesRegexp(ValueError, error_pattern, DatasetTemplate.from_json, BAD_JSON) + self.assertRaisesRegexp( + ValueError, error_pattern, DatasetTemplate.from_json, BAD_JSON + ) def test_init_from_json(self): template = DatasetTemplate.from_json(TEMPLATE_JSON) @@ -115,32 +131,43 @@ def test_init_from_partial_template(self): with open(TEMPLATE_PARTIAL_JSON) as t: tdict = json.load(t, object_pairs_hook=OrderedDict) self.assertEqual({}, template.dimensions) - self.assertEqual(tdict['_variables'], template.variables) + self.assertEqual(tdict["_variables"], template.variables) self.assertEqual(metadata_attributes(tdict), template.global_attributes) def test_add_method(self): - template1 = DatasetTemplate(dimensions={'ONE': 1}, - variables={'X': {'_dimensions': ['ONE'], '_datatype': 'float32'}, - 'Y': {'_dimensions': ['ONE'], '_datatype': 'float32'} - }, - global_attributes={'title': 'First template', 'comment': 'one'} - ) - template2 = DatasetTemplate(dimensions={'TWO': 2}, - variables={'Y': {'_dimensions': ['TWO'], 'comment': 'updated'}, - 'Z': {'name': 'new'} - }, - global_attributes={'title': 'Second template', 'version': 2} - ) + template1 = DatasetTemplate( + dimensions={"ONE": 1}, + variables={ + "X": {"_dimensions": ["ONE"], "_datatype": "float32"}, + "Y": {"_dimensions": ["ONE"], "_datatype": "float32"}, + }, + global_attributes={"title": "First template", "comment": "one"}, + ) + template2 = DatasetTemplate( + dimensions={"TWO": 2}, + variables={ + "Y": {"_dimensions": ["TWO"], "comment": "updated"}, + "Z": {"name": "new"}, + }, + global_attributes={"title": "Second template", "version": 2}, + ) template = template1 + template2 - self.assertEqual({'ONE': 1, 'TWO': 2}, template.dimensions) - self.assertEqual({'title': 'Second template', 'comment': 'one', 'version': 2}, template.global_attributes) + self.assertEqual({"ONE": 1, "TWO": 2}, template.dimensions) + self.assertEqual( + {"title": "Second template", "comment": "one", "version": 2}, + template.global_attributes, + ) - self.assertSetEqual({'X', 'Y', 'Z'}, set(template.variables.keys())) - self.assertEqual({'_dimensions': ['ONE'], '_datatype': 'float32'}, template.variables['X']) - self.assertEqual({'_dimensions': ['TWO'], '_datatype': 'float32', 'comment': 'updated'}, - template.variables['Y']) - self.assertEqual({'name': 'new'}, template.variables['Z']) + self.assertSetEqual({"X", "Y", "Z"}, set(template.variables.keys())) + self.assertEqual( + {"_dimensions": ["ONE"], "_datatype": "float32"}, template.variables["X"] + ) + self.assertEqual( + {"_dimensions": ["TWO"], "_datatype": "float32", "comment": "updated"}, + template.variables["Y"], + ) + self.assertEqual({"name": "new"}, template.variables["Z"]) # TODO: def test_json_validation(self): @@ -153,43 +180,49 @@ def test_add_global_attributes(self): def test_add_dimensions(self): template = DatasetTemplate.from_json(TEMPLATE_PARTIAL_JSON) - template.dimensions['TIME'] = 100 - template.dimensions['DEPTH'] = 10 - self.assertEqual(OrderedDict([('TIME', 100), ('DEPTH', 10)]), template.dimensions) + template.dimensions["TIME"] = 100 + template.dimensions["DEPTH"] = 10 + self.assertEqual( + OrderedDict([("TIME", 100), ("DEPTH", 10)]), template.dimensions + ) def test_change_dimensions(self): 
template = DatasetTemplate.from_json(TEMPLATE_JSON) - template.dimensions['TIME'] = 100 - template.dimensions['DEPTH'] = 10 - self.assertEqual(OrderedDict([('TIME', 100), ('DEPTH', 10)]), template.dimensions) + template.dimensions["TIME"] = 100 + template.dimensions["DEPTH"] = 10 + self.assertEqual( + OrderedDict([("TIME", 100), ("DEPTH", 10)]), template.dimensions + ) def test_add_variables(self): template = DatasetTemplate.from_json(TEMPLATE_PARTIAL_JSON) - template.variables['TIME'] = self.variables['TIME'] - self.assertEqual({'TEMP', 'TIME'}, set(template.variables.keys())) - self.assertEqual(self.variables['TIME'], template.variables['TIME']) + template.variables["TIME"] = self.variables["TIME"] + self.assertEqual({"TEMP", "TIME"}, set(template.variables.keys())) + self.assertEqual(self.variables["TIME"], template.variables["TIME"]) def test_add_variable_dimensions(self): template = DatasetTemplate.from_json(TEMPLATE_PARTIAL_JSON) - template.variables['TEMP']['_dimensions'] = ['TIME', 'DEPTH'] - self.assertEqual(['TIME', 'DEPTH'], template.variables['TEMP']['_dimensions']) + template.variables["TEMP"]["_dimensions"] = ["TIME", "DEPTH"] + self.assertEqual(["TIME", "DEPTH"], template.variables["TEMP"]["_dimensions"]) def test_add_variable_attributes(self): template = DatasetTemplate.from_json(TEMPLATE_PARTIAL_JSON) - template.variables['TEMP'].update([('units', 'Kelvin'), - ('comment', 'ok') - ]) - self.assertEqual(OrderedDict([('standard_name', 'sea_water_temperature'), - ('units', 'Kelvin'), - ('comment', 'ok') - ]), - template.variables['TEMP'] - ) + template.variables["TEMP"].update([("units", "Kelvin"), ("comment", "ok")]) + self.assertEqual( + OrderedDict( + [ + ("standard_name", "sea_water_temperature"), + ("units", "Kelvin"), + ("comment", "ok"), + ] + ), + template.variables["TEMP"], + ) def test_set_variable_values(self): template = DatasetTemplate.from_json(TEMPLATE_JSON) - template.variables['TEMP']['_data'] = self.values10 - self.assertTrue(all(template.variables['TEMP']['_data'] == self.values10)) + template.variables["TEMP"]["_data"] = self.values10 + self.assertTrue(all(template.variables["TEMP"]["_data"] == self.values10)) def test_create_empty_file(self): template = DatasetTemplate() @@ -197,52 +230,59 @@ def test_create_empty_file(self): dataset = Dataset(self.temp_nc_file) def test_create_empty_variable(self): - template = DatasetTemplate(dimensions={'X': 10}) - template.variables['X'] = {'_dimensions': ['X'], '_datatype': 'float32'} - self.assertRaises(ValidationError, template.to_netcdf, self.temp_nc_file) # not providing '_data' is an error + template = DatasetTemplate(dimensions={"X": 10}) + template.variables["X"] = {"_dimensions": ["X"], "_datatype": "float32"} + self.assertRaises( + ValidationError, template.to_netcdf, self.temp_nc_file + ) # not providing '_data' is an error del self._temp_nc_file # Get a new temp file - template.variables['X']['_data'] = None # This is ok, it's a shortcut for all fill values + template.variables["X"][ + "_data" + ] = None # This is ok, it's a shortcut for all fill values template.to_netcdf(self.temp_nc_file) dataset = Dataset(self.temp_nc_file) dataset.set_auto_mask(True) - dsx = dataset.variables['X'] + dsx = dataset.variables["X"] self.assertIsInstance(dsx[:], np.ma.MaskedArray) self.assertTrue(dsx[:].mask.all()) def test_create_file(self): template = DatasetTemplate.from_json(TEMPLATE_JSON) - template.variables['TIME']['_data'] = self.values10 - template.variables['DEPTH']['_data'] = self.values1 - 
template.variables['TEMP']['_data'] = self.values10.reshape((10, 1)) + template.variables["TIME"]["_data"] = self.values10 + template.variables["DEPTH"]["_data"] = self.values1 + template.variables["TEMP"]["_data"] = self.values10.reshape((10, 1)) template.to_netcdf(self.temp_nc_file) dataset = Dataset(self.temp_nc_file) - expected_dimensions = OrderedDict([ - ('TIME', len(self.values10)), - ('DEPTH', len(self.values1)) - ]) + expected_dimensions = OrderedDict( + [("TIME", len(self.values10)), ("DEPTH", len(self.values1))] + ) ds_dimensions = OrderedDict((k, v.size) for k, v in dataset.dimensions.items()) self.assertEqual(expected_dimensions, ds_dimensions) for vname, vdict in self.variables.items(): ds_var = dataset[vname] - self.assertEqual(vdict['_dimensions'], list(ds_var.dimensions)) - self.assertEqual(vdict['_datatype'], ds_var.dtype) - ds_var_attr = OrderedDict((k, ds_var.getncattr(k)) for k in ds_var.ncattrs()) + self.assertEqual(vdict["_dimensions"], list(ds_var.dimensions)) + self.assertEqual(vdict["_datatype"], ds_var.dtype) + ds_var_attr = OrderedDict( + (k, ds_var.getncattr(k)) for k in ds_var.ncattrs() + ) self.assertEqual(metadata_attributes(vdict), ds_var_attr) - self.assertTrue(all(dataset['TIME'] == self.values10)) - self.assertTrue(all(dataset['DEPTH'] == self.values1)) - self.assertTrue(all(dataset['TEMP'] == self.values10.reshape(10, 1))) + self.assertTrue(all(dataset["TIME"] == self.values10)) + self.assertTrue(all(dataset["DEPTH"] == self.values1)) + self.assertTrue(all(dataset["TEMP"] == self.values10.reshape(10, 1))) - ds_global_attributes = OrderedDict((k, dataset.getncattr(k)) for k in dataset.ncattrs()) + ds_global_attributes = OrderedDict( + (k, dataset.getncattr(k)) for k in dataset.ncattrs() + ) self.assertEqual(self.global_attributes, ds_global_attributes) def test_close_file_on_exception(self): - template = DatasetTemplate(variables={'Z': {}}) + template = DatasetTemplate(variables={"Z": {}}) self.assertIsNone(template.ncobj) self.assertRaises(ValidationError, template.to_netcdf, self.temp_nc_file) self.assertIsNone(template.ncobj) @@ -250,153 +290,175 @@ def test_close_file_on_exception(self): # TODO: Use mock to make this fail *after* ncobj is created def test_dimensionless_variable(self): - template = DatasetTemplate(variables={'X': {'_datatype': 'double', '_data': np.array(1)}}) + template = DatasetTemplate( + variables={"X": {"_datatype": "double", "_data": np.array(1)}} + ) template.to_netcdf(self.temp_nc_file) dataset = Dataset(self.temp_nc_file) - self.assertEqual((), dataset.variables['X'].dimensions) + self.assertEqual((), dataset.variables["X"].dimensions) def test_ensure_completeness(self): - template = DatasetTemplate(dimensions={'X': 1}) + template = DatasetTemplate(dimensions={"X": 1}) template.variables = { - 'A': {'_dimensions': ['X'], '_datatype': 'float32', '_data': [12.3]}, - 'B': {'_dimensions': ['X'], '_data': [12.3]}, - 'X': {'_dimensions': ['X'], '_data': self.values1}, - 'Y': {'_datatype': 'float32', '_data': None} + "A": {"_dimensions": ["X"], "_datatype": "float32", "_data": [12.3]}, + "B": {"_dimensions": ["X"], "_data": [12.3]}, + "X": {"_dimensions": ["X"], "_data": self.values1}, + "Y": {"_datatype": "float32", "_data": None}, } template.ensure_completeness() - self.assertEqual(['X'], template.variables['A']['_dimensions']) - self.assertEqual(np.dtype('float32'), template.variables['A']['_datatype']) - self.assertEqual([12.3], template.variables['A']['_data']) - self.assertIsInstance(template.variables['A']['_data'], 
np.ndarray) + self.assertEqual(["X"], template.variables["A"]["_dimensions"]) + self.assertEqual(np.dtype("float32"), template.variables["A"]["_datatype"]) + self.assertEqual([12.3], template.variables["A"]["_data"]) + self.assertIsInstance(template.variables["A"]["_data"], np.ndarray) - self.assertEqual(np.dtype('float64'), template.variables['B']['_datatype']) + self.assertEqual(np.dtype("float64"), template.variables["B"]["_datatype"]) - self.assertIs(self.values1.dtype, template.variables['X']['_datatype']) + self.assertIs(self.values1.dtype, template.variables["X"]["_datatype"]) - self.assertEqual([], template.variables['Y']['_dimensions']) + self.assertEqual([], template.variables["Y"]["_dimensions"]) - template.variables = {'Z': {'_dimensions': [], '_data': None}} - self.assertRaisesRegexp(ValidationError, r"No data type information for variable 'Z'", - template.ensure_completeness) + template.variables = {"Z": {"_dimensions": [], "_data": None}} + self.assertRaisesRegexp( + ValidationError, + r"No data type information for variable 'Z'", + template.ensure_completeness, + ) - template.variables = {'Z': {'_dimensions': []}} - self.assertRaisesRegexp(ValidationError, r"No data specified for variable 'Z'", - template.ensure_completeness) + template.variables = {"Z": {"_dimensions": []}} + self.assertRaisesRegexp( + ValidationError, + r"No data specified for variable 'Z'", + template.ensure_completeness, + ) def test_ensure_consistency(self): template = DatasetTemplate() - scalar = {'_dimensions': [], '_data': np.array(1)} - template.variables = {'SCALAR': scalar} + scalar = {"_dimensions": [], "_data": np.array(1)} + template.variables = {"SCALAR": scalar} template.ensure_consistency() self.assertEqual({}, template.dimensions) - self.assertIs(scalar, template.variables['SCALAR']) + self.assertIs(scalar, template.variables["SCALAR"]) - template = DatasetTemplate(dimensions={'TEN': 10}) - var_10 = {'_dimensions': ['TEN'], '_data': self.values10} - template.variables = {'TEN': var_10} + template = DatasetTemplate(dimensions={"TEN": 10}) + var_10 = {"_dimensions": ["TEN"], "_data": self.values10} + template.variables = {"TEN": var_10} template.ensure_consistency() - self.assertEqual({'TEN': 10}, template.dimensions) - self.assertIs(var_10, template.variables['TEN']) + self.assertEqual({"TEN": 10}, template.dimensions) + self.assertIs(var_10, template.variables["TEN"]) - template = DatasetTemplate(dimensions={'X': None}) - var_12 = {'_dimensions': ['X'], '_data': np.arange(12)} - template.variables = {'X': var_12} + template = DatasetTemplate(dimensions={"X": None}) + var_12 = {"_dimensions": ["X"], "_data": np.arange(12)} + template.variables = {"X": var_12} template.ensure_consistency() - self.assertEqual({'X': 12}, template.dimensions) - self.assertIs(var_12, template.variables['X']) + self.assertEqual({"X": 12}, template.dimensions) + self.assertIs(var_12, template.variables["X"]) - empty = {'_dimensions': ['X'], '_data': None} - template.variables['EMPTY'] = empty + empty = {"_dimensions": ["X"], "_data": None} + template.variables["EMPTY"] = empty template.ensure_consistency() - self.assertEqual({'X': 12}, template.dimensions) - self.assertIs(empty, template.variables['EMPTY']) + self.assertEqual({"X": 12}, template.dimensions) + self.assertIs(empty, template.variables["EMPTY"]) - template.variables['X']['_data'] = self.values1 - self.assertRaisesRegexp(ValueError, 'inconsistent with dimension sizes defined in template', - template.ensure_consistency) # now should fail because dim X 
is already set + template.variables["X"]["_data"] = self.values1 + self.assertRaisesRegexp( + ValueError, + "inconsistent with dimension sizes defined in template", + template.ensure_consistency, + ) # now should fail because dim X is already set template.variables = { - 'Z': {'_dimensions': ["NOSUCHTHING"], '_data': self.values10} + "Z": {"_dimensions": ["NOSUCHTHING"], "_data": self.values10} } - self.assertRaisesRegexp(ValidationError, 'undefined dimensions', template.ensure_consistency) + self.assertRaisesRegexp( + ValidationError, "undefined dimensions", template.ensure_consistency + ) template.variables = { - 'W': {'_dimensions': ['X'], '_data': np.arange(20).reshape((10,2))} + "W": {"_dimensions": ["X"], "_data": np.arange(20).reshape((10, 2))} } - self.assertRaisesRegexp(ValueError, - "Variable 'W' has 1 dimensions, but value array has 2 dimensions.", - template.ensure_consistency - ) + self.assertRaisesRegexp( + ValueError, + "Variable 'W' has 1 dimensions, but value array has 2 dimensions.", + template.ensure_consistency, + ) class TestDataValues(TemplateTestCase): def setUp(self): super(TestDataValues, self).setUp() - self.data_array = np.array([-999., -999., -999., -999., -999., 1., 2., 3., 4., 5]) - self.data_masked = np.ma.masked_array([-4, -3, -2, -1, 0, 1., 2., 3., 4., 5], - mask=[True, True, True, True, True, False, False, False, False, False]) + self.data_array = np.array( + [-999.0, -999.0, -999.0, -999.0, -999.0, 1.0, 2.0, 3.0, 4.0, 5] + ) + self.data_masked = np.ma.masked_array( + [-4, -3, -2, -1, 0, 1.0, 2.0, 3.0, 4.0, 5], + mask=[True, True, True, True, True, False, False, False, False, False], + ) self.template = DatasetTemplate( - dimensions={'TIME': 10}, + dimensions={"TIME": 10}, variables={ - 'TIME': { - '_dimensions': ['TIME'], - '_datatype': 'float64', - 'valid_min': 0, - 'valid_max': 10, - '_data': np.array([np.nan, np.nan, 1, 2, 3, 4, 5, 6, 7, 8]) + "TIME": { + "_dimensions": ["TIME"], + "_datatype": "float64", + "valid_min": 0, + "valid_max": 10, + "_data": np.array([np.nan, np.nan, 1, 2, 3, 4, 5, 6, 7, 8]), + }, + "X": { + "_dimensions": ["TIME"], + "_datatype": "float32", + "valid_min": 1, + "valid_max": 5, + "_FillValue": -999, + "_data": self.data_array, }, - 'X': { - '_dimensions': ['TIME'], - '_datatype': 'float32', - 'valid_min': 1, - 'valid_max': 5, - '_FillValue': -999, - '_data': self.data_array + "Y": { + "_dimensions": ["TIME"], + "_datatype": "float32", + "valid_range": [-4, 5], + "_fill_value": -999, + "_data": self.data_masked, }, - 'Y': { - '_dimensions': ['TIME'], - '_datatype': 'float32', - 'valid_range': [-4, 5], - '_fill_value': -999, - '_data': self.data_masked + "N": { + "_dimensions": ["TIME"], + "_datatype": "int32", + "valid_range": [-4, 5], + "_fill_value": -999, + "_data": self.data_array, }, - 'N': { - '_dimensions': ['TIME'], - '_datatype': 'int32', - 'valid_range': [-4, 5], - '_fill_value': -999, - '_data': self.data_array - } - } + }, ) def test_fill_values(self): self.template.to_netcdf(self.temp_nc_file) dataset = Dataset(self.temp_nc_file) dataset.set_auto_mask(True) - for varname in ('X', 'Y'): + for varname in ("X", "Y"): dsvar = dataset.variables[varname] - self.assertEqual(-999., dsvar._FillValue) + self.assertEqual(-999.0, dsvar._FillValue) self.assertIsInstance(dsvar[:], np.ma.MaskedArray) self.assertTrue(dsvar[:5].mask.all()) self.assertTrue((dsvar[5:] == self.data_array[5:]).all()) def test_fill_value_aliases(self): - self.template.variables['X']['_fill_value'] = -999. 
# both aliases, but equal so should still work + self.template.variables["X"][ + "_fill_value" + ] = -999.0 # both aliases, but equal so should still work self.template.to_netcdf(self.temp_nc_file) dataset = Dataset(self.temp_nc_file) - self.assertEqual(-999., dataset.variables['X']._FillValue) + self.assertEqual(-999.0, dataset.variables["X"]._FillValue) del self._temp_nc_file - self.template.variables['X']['_fill_value'] = -666. # now they're different, which is an error + self.template.variables["X"][ + "_fill_value" + ] = -666.0 # now they're different, which is an error self.assertRaises(ValueError, self.template.to_netcdf, self.temp_nc_file) def test_get_data_range(self): - self.assertEqual((1, 8), self.template.get_data_range('TIME')) - self.assertEqual((1, 5), self.template.get_data_range('X')) - self.assertEqual((1, 5), self.template.get_data_range('Y')) + self.assertEqual((1, 8), self.template.get_data_range("TIME")) + self.assertEqual((1, 5), self.template.get_data_range("X")) + self.assertEqual((1, 5), self.template.get_data_range("Y")) def test_var_attr_datatype_conversion(self): """ @@ -406,20 +468,21 @@ def test_var_attr_datatype_conversion(self): self.template.to_netcdf(self.temp_nc_file) dataset = Dataset(self.temp_nc_file) - TIME = dataset.variables['TIME'] + TIME = dataset.variables["TIME"] self.assertEqual(TIME.dtype, TIME.valid_min.dtype) self.assertEqual(TIME.dtype, TIME.valid_max.dtype) - X = dataset.variables['X'] + X = dataset.variables["X"] self.assertEqual(X.dtype, X.valid_min.dtype) self.assertEqual(X.dtype, X.valid_max.dtype) self.assertEqual(X.dtype, X._FillValue.dtype) - for v in ['Y', 'N']: + for v in ["Y", "N"]: var = dataset.variables[v] self.assertEqual(var.dtype, var.valid_range.dtype) self.assertEqual(var.dtype, var._FillValue.dtype) + # TODO: add data from multiple numpy arrays # e.g. template.add_data(TIME=time_values, TEMP=temp_values, PRES=pres_values) # TODO: add data from Pandas dataframe (later...) @@ -429,5 +492,5 @@ def test_var_attr_datatype_conversion(self): # e.g. 
template.create() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/timeseries_products/test_aggregated_timeseries.py b/test_aodntools/timeseries_products/test_aggregated_timeseries.py index 9f79d6c..5bf80cc 100644 --- a/test_aodntools/timeseries_products/test_aggregated_timeseries.py +++ b/test_aodntools/timeseries_products/test_aggregated_timeseries.py @@ -11,59 +11,93 @@ from test_aodntools.base_test import BaseTestCase TEST_ROOT = os.path.dirname(__file__) -BAD_FILE = 'IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV00_NRSROT-1812-SBE39-43_END-20181214T004000Z_C-20190827T000000Z.nc' +BAD_FILE = "IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV00_NRSROT-1812-SBE39-43_END-20181214T004000Z_C-20190827T000000Z.nc" INPUT_FILES = [ - 'IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV01_NRSROT-1812-SBE39-23_END-20190306T160000Z_C-20190827T000000Z.nc', - 'IMOS_ANMN-NRS_TZ_20190313T144000Z_NRSROT_FV01_NRSROT-1903-SBE39-27_END-20190524T010000Z_C-20190827T000000Z.nc', - 'IMOS_ANMN-NRS_BCKOSTUZ_20181213T080038Z_NRSROT_FV01_NRSROT-1812-WQM-55_END-20181215T013118Z_C-20190828T000000Z.nc', - BAD_FILE + "IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV01_NRSROT-1812-SBE39-23_END-20190306T160000Z_C-20190827T000000Z.nc", + "IMOS_ANMN-NRS_TZ_20190313T144000Z_NRSROT_FV01_NRSROT-1903-SBE39-27_END-20190524T010000Z_C-20190827T000000Z.nc", + "IMOS_ANMN-NRS_BCKOSTUZ_20181213T080038Z_NRSROT_FV01_NRSROT-1812-WQM-55_END-20181215T013118Z_C-20190828T000000Z.nc", + BAD_FILE, ] class TestAggregatedTimeseries(BaseTestCase): EXPECTED_OUTPUT_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_TZ_20181213_NRSROT_FV01_TEMP-aggregated-timeseries_END-20190523_C-20220607.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_TZ_20181213_NRSROT_FV01_TEMP-aggregated-timeseries_END-20190523_C-20220607.nc", ) def test_main_aggregator(self): - output_file, bad_files = main_aggregator(INPUT_FILES, 'TEMP', 'NRSROT', input_dir=TEST_ROOT, - output_dir='/tmp') + output_file, bad_files = main_aggregator( + INPUT_FILES, "TEMP", "NRSROT", input_dir=TEST_ROOT, output_dir="/tmp" + ) self.assertEqual(4, len(INPUT_FILES)) self.assertEqual(1, len(bad_files)) for file, errors in bad_files.items(): self.assertEqual(BAD_FILE, file) - self.assertSetEqual(set(errors), {'no NOMINAL_DEPTH', - 'Wrong file version: Level 0 - Raw Data', - 'no time_deployment_start attribute', - 'no time_deployment_end attribute' - } - ) + self.assertSetEqual( + set(errors), + { + "no NOMINAL_DEPTH", + "Wrong file version: Level 0 - Raw Data", + "no time_deployment_start attribute", + "no time_deployment_end attribute", + }, + ) dataset = Dataset(output_file) # check dimensions and variables - self.assertSetEqual(set(dataset.dimensions), {'OBSERVATION', 'INSTRUMENT', 'strlen'}) - self.assertSetEqual(set(dataset.variables.keys()), - {'TIME', 'LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH', 'DEPTH', 'DEPTH_quality_control', - 'PRES', 'PRES_quality_control', 'PRES_REL', 'PRES_REL_quality_control', - 'TEMP', 'TEMP_quality_control', 'instrument_index', 'instrument_id', 'source_file'} - ) - - obs_vars = {'TIME', 'DEPTH', 'DEPTH_quality_control', 'PRES', 'PRES_quality_control', - 'PRES_REL', 'PRES_REL_quality_control', 'TEMP', 'TEMP_quality_control', 'instrument_index'} + self.assertSetEqual( + set(dataset.dimensions), {"OBSERVATION", "INSTRUMENT", "strlen"} + ) + self.assertSetEqual( + set(dataset.variables.keys()), + { + "TIME", + "LATITUDE", + "LONGITUDE", + "NOMINAL_DEPTH", + "DEPTH", + "DEPTH_quality_control", + "PRES", + "PRES_quality_control", + "PRES_REL", + 
"PRES_REL_quality_control", + "TEMP", + "TEMP_quality_control", + "instrument_index", + "instrument_id", + "source_file", + }, + ) + + obs_vars = { + "TIME", + "DEPTH", + "DEPTH_quality_control", + "PRES", + "PRES_quality_control", + "PRES_REL", + "PRES_REL_quality_control", + "TEMP", + "TEMP_quality_control", + "instrument_index", + } for var in obs_vars: - self.assertEqual(dataset.variables[var].dimensions, ('OBSERVATION',)) + self.assertEqual(dataset.variables[var].dimensions, ("OBSERVATION",)) - inst_vars = {'LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH'} + inst_vars = {"LATITUDE", "LONGITUDE", "NOMINAL_DEPTH"} for var in inst_vars: - self.assertEqual(dataset.variables[var].dimensions, ('INSTRUMENT',)) + self.assertEqual(dataset.variables[var].dimensions, ("INSTRUMENT",)) - string_vars = {'source_file', 'instrument_id'} + string_vars = {"source_file", "instrument_id"} for var in string_vars: - self.assertEqual(dataset.variables[var].dimensions, ('INSTRUMENT', 'strlen')) + self.assertEqual( + dataset.variables[var].dimensions, ("INSTRUMENT", "strlen") + ) - for f in chartostring(dataset['source_file'][:]): + for f in chartostring(dataset["source_file"][:]): self.assertIn(f, INPUT_FILES) # check attributes @@ -78,18 +112,34 @@ def test_main_aggregator(self): self.compare_variables(dataset) def test_source_file_attributes(self): - output_file, bad_files = main_aggregator(INPUT_FILES, 'PSAL', 'NRSROT', input_dir=TEST_ROOT, - output_dir='/tmp', download_url_prefix='http://test.download.url', - opendap_url_prefix='http://test.opendap.url' - ) + output_file, bad_files = main_aggregator( + INPUT_FILES, + "PSAL", + "NRSROT", + input_dir=TEST_ROOT, + output_dir="/tmp", + download_url_prefix="http://test.download.url", + opendap_url_prefix="http://test.opendap.url", + ) dataset = Dataset(output_file) - self.assertEqual(dataset['source_file'].download_url_prefix, 'http://test.download.url') - self.assertEqual(dataset['source_file'].opendap_url_prefix, 'http://test.opendap.url') + self.assertEqual( + dataset["source_file"].download_url_prefix, "http://test.download.url" + ) + self.assertEqual( + dataset["source_file"].opendap_url_prefix, "http://test.opendap.url" + ) def test_all_rejected(self): - self.assertRaises(NoInputFilesError, main_aggregator, [BAD_FILE], 'TEMP', 'NRSROT', - input_dir=TEST_ROOT, output_dir='/tmp') - - -if __name__ == '__main__': + self.assertRaises( + NoInputFilesError, + main_aggregator, + [BAD_FILE], + "TEMP", + "NRSROT", + input_dir=TEST_ROOT, + output_dir="/tmp", + ) + + +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/timeseries_products/test_common.py b/test_aodntools/timeseries_products/test_common.py index 382d553..6033e33 100644 --- a/test_aodntools/timeseries_products/test_common.py +++ b/test_aodntools/timeseries_products/test_common.py @@ -5,24 +5,32 @@ import xarray as xr -from aodntools.timeseries_products.common import (check_file, check_velocity_file, get_qc_variable_names, - check_imos_flag_conventions, in_water_index, in_water) +from aodntools.timeseries_products.common import ( + check_file, + check_velocity_file, + get_qc_variable_names, + check_imos_flag_conventions, + in_water_index, + in_water, +) TEST_ROOT = os.path.dirname(__file__) GOOD_TZ_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV01_NRSROT-1812-SBE39-23_END-20190306T160000Z_C-20190827T000000Z.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV01_NRSROT-1812-SBE39-23_END-20190306T160000Z_C-20190827T000000Z.nc", ) GOOD_V_FILE = 
os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_AETVZ_20181213T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1812-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20181215T100000Z_C-20200430T000000Z.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_AETVZ_20181213T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1812-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20181215T100000Z_C-20200430T000000Z.nc", ) BAD_TZ_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV00_NRSROT-1812-SBE39-43_END-20181214T004000Z_C-20190827T000000Z.nc' -) -BAD_V_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_BAD_VELOCITY_FILE.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV00_NRSROT-1812-SBE39-43_END-20181214T004000Z_C-20190827T000000Z.nc", ) +BAD_V_FILE = os.path.join(TEST_ROOT, "IMOS_ANMN-NRS_BAD_VELOCITY_FILE.nc") AM_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-AM_GST_20190419T100000Z_NRSMAI_FV01_NRSMAI-CO2-1904-delayed_END-20190531T020000Z_C-20200625T000000Z.nc' + TEST_ROOT, + "IMOS_ANMN-AM_GST_20190419T100000Z_NRSMAI_FV01_NRSMAI-CO2-1904-delayed_END-20190531T020000Z_C-20200625T000000Z.nc", ) @@ -30,77 +38,98 @@ class TestQCVariableFunctions(unittest.TestCase): def test_get_qc_variable_names(self): with xr.open_dataset(GOOD_TZ_FILE) as nc: qc_var = get_qc_variable_names(nc) - self.assertEqual(qc_var, ['DEPTH_quality_control', 'TEMP_quality_control']) + self.assertEqual(qc_var, ["DEPTH_quality_control", "TEMP_quality_control"]) def test_check_flag_convetions(self): with xr.open_dataset(GOOD_TZ_FILE) as nc: self.assertEqual(check_imos_flag_conventions(nc), []) - self.assertEqual(check_imos_flag_conventions(nc, ['NONE']), - ['variable NONE not in file']) - self.assertEqual(check_imos_flag_conventions(nc, ['TEMP']), - ['variable TEMP missing quality_control_conventions']) + self.assertEqual( + check_imos_flag_conventions(nc, ["NONE"]), ["variable NONE not in file"] + ) + self.assertEqual( + check_imos_flag_conventions(nc, ["TEMP"]), + ["variable TEMP missing quality_control_conventions"], + ) def test_check_flag_conventions_bad(self): with xr.open_dataset(AM_FILE) as nc: errors = check_imos_flag_conventions(nc) - self.assertEqual(errors, ['unexpected quality_control_conventions: "WOCE quality control procedure"']) + self.assertEqual( + errors, + [ + 'unexpected quality_control_conventions: "WOCE quality control procedure"' + ], + ) class TestCheckFile(unittest.TestCase): def test_good_temp_file(self): with xr.open_dataset(GOOD_TZ_FILE) as nc: - error_list = check_file(nc, 'NRSROT', 'TEMP') + error_list = check_file(nc, "NRSROT", "TEMP") self.assertEqual(error_list, []) def test_variable_list(self): with xr.open_dataset(GOOD_TZ_FILE) as nc: - error_list = check_file(nc, 'NRSROT', ['TEMP', 'PSAL', 'DEPTH']) + error_list = check_file(nc, "NRSROT", ["TEMP", "PSAL", "DEPTH"]) self.assertEqual(error_list, []) def test_wrong_site_and_var(self): with xr.open_dataset(GOOD_TZ_FILE) as nc: - error_list = check_file(nc, 'NO_SITE', 'OTHER') - self.assertEqual(set(error_list), {'Wrong site_code: NRSROT', 'no variables to aggregate'}) + error_list = check_file(nc, "NO_SITE", "OTHER") + self.assertEqual( + set(error_list), {"Wrong site_code: NRSROT", "no variables to aggregate"} + ) def test_bad_temp_file(self): with xr.open_dataset(BAD_TZ_FILE) as nc: - error_list = check_file(nc, 'NRSROT', 'TEMP') - self.assertEqual(set(error_list), - {'no NOMINAL_DEPTH', 'Wrong file version: Level 0 - Raw Data', - 'no time_deployment_start attribute', 'no time_deployment_end attribute'} - ) + error_list = check_file(nc, "NRSROT", "TEMP") + 
self.assertEqual( + set(error_list), + { + "no NOMINAL_DEPTH", + "Wrong file version: Level 0 - Raw Data", + "no time_deployment_start attribute", + "no time_deployment_end attribute", + }, + ) def test_good_velocity_file(self): with xr.open_dataset(GOOD_V_FILE) as nc: - error_list = check_velocity_file(nc, 'NRSROT') + error_list = check_velocity_file(nc, "NRSROT") self.assertEqual(error_list, []) def test_bad_velocity_file(self): with xr.open_dataset(BAD_V_FILE) as nc: - error_list = check_velocity_file(nc, 'NWSROW') - self.assertEqual(set(error_list), {'VCUR variable missing', - 'DEPTH variable missing', - "dimension(s) {'DIST_ALONG_BEAMS'} not allowed for UCUR", - 'no in-water data' - } - ) + error_list = check_velocity_file(nc, "NWSROW") + self.assertEqual( + set(error_list), + { + "VCUR variable missing", + "DEPTH variable missing", + "dimension(s) {'DIST_ALONG_BEAMS'} not allowed for UCUR", + "no in-water data", + }, + ) def test_am_file(self): with xr.open_dataset(AM_FILE) as nc: - error_list = check_file(nc, 'NRSMAI', 'TEMP') - self.assertEqual(set(error_list), {'no NOMINAL_DEPTH', - 'no time_deployment_start attribute', - 'no time_deployment_end attribute', - 'unexpected quality_control_conventions: "WOCE quality control procedure"' - } - ) + error_list = check_file(nc, "NRSMAI", "TEMP") + self.assertEqual( + set(error_list), + { + "no NOMINAL_DEPTH", + "no time_deployment_start attribute", + "no time_deployment_end attribute", + 'unexpected quality_control_conventions: "WOCE quality control procedure"', + }, + ) class TestInWater(unittest.TestCase): def test_in_water_index_ok(self): with xr.open_dataset(BAD_TZ_FILE) as nc: - nc.attrs['time_deployment_start'] = '2018-12-13T08:00:00Z' - nc.attrs['time_deployment_end'] = '2018-12-14T00:30:00Z' + nc.attrs["time_deployment_start"] = "2018-12-13T08:00:00Z" + nc.attrs["time_deployment_end"] = "2018-12-14T00:30:00Z" index = in_water_index(nc) self.assertTrue(all(index[:-2])) self.assertFalse(any(index[-2:])) @@ -112,12 +141,13 @@ def test_in_water_index_bad(self): def test_in_water_ok(self): with xr.open_dataset(BAD_TZ_FILE) as nc: - nc.attrs['time_deployment_start'] = '2018-12-13T08:00:00Z' - nc.attrs['time_deployment_end'] = '2018-12-14T00:30:00Z' + nc.attrs["time_deployment_start"] = "2018-12-13T08:00:00Z" + nc.attrs["time_deployment_end"] = "2018-12-14T00:30:00Z" nc_in = in_water(nc) self.assertEqual(len(nc_in.TIME), len(nc.TIME) - 2) self.assertTrue(all(nc_in.TIME.values == nc.TIME[:-2].values)) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/timeseries_products/test_hourly_timeseries.py b/test_aodntools/timeseries_products/test_hourly_timeseries.py index 9e6a7d4..7d15c32 100644 --- a/test_aodntools/timeseries_products/test_hourly_timeseries.py +++ b/test_aodntools/timeseries_products/test_hourly_timeseries.py @@ -12,80 +12,116 @@ TEST_ROOT = os.path.dirname(__file__) -BAD_FILE = 'IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV00_NRSROT-1812-SBE39-43_END-20181214T004000Z_C-20190827T000000Z.nc' +BAD_FILE = "IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV00_NRSROT-1812-SBE39-43_END-20181214T004000Z_C-20190827T000000Z.nc" INPUT_FILES = [ - 'IMOS_ANMN-NRS_BCKOSTUZ_20181213T080038Z_NRSROT_FV01_NRSROT-1812-WQM-55_END-20181215T013118Z_C-20190828T000000Z.nc', - 'IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV01_NRSROT-1812-SBE39-23_END-20190306T160000Z_C-20190827T000000Z.nc', - 'IMOS_ANMN-NRS_TZ_20190313T144000Z_NRSROT_FV01_NRSROT-1903-SBE39-27_END-20190524T010000Z_C-20190827T000000Z.nc', - BAD_FILE 
+ "IMOS_ANMN-NRS_BCKOSTUZ_20181213T080038Z_NRSROT_FV01_NRSROT-1812-WQM-55_END-20181215T013118Z_C-20190828T000000Z.nc", + "IMOS_ANMN-NRS_TZ_20181213T080000Z_NRSROT_FV01_NRSROT-1812-SBE39-23_END-20190306T160000Z_C-20190827T000000Z.nc", + "IMOS_ANMN-NRS_TZ_20190313T144000Z_NRSROT_FV01_NRSROT-1903-SBE39-27_END-20190524T010000Z_C-20190827T000000Z.nc", + BAD_FILE, ] INPUT_PATHS = [os.path.join(TEST_ROOT, f) for f in INPUT_FILES] -INST_VARIABLES = {'instrument_id', 'source_file', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'} -OBS_VARIABLES = {'instrument_index', 'TIME'} -measured_variables = {'DEPTH', 'CPHL', 'CHLF', 'CHLU', - 'DOX', 'DOX1', 'DOX1_2', 'DOX1_3', 'DOX2', 'DOX2_1', 'DOXS', 'DOXY', - 'PRES', 'PRES_REL', 'PSAL', 'TEMP', 'TURB', 'PAR' - } -function_stats = ['_min', '_max', '_std', '_count'] +INST_VARIABLES = { + "instrument_id", + "source_file", + "LONGITUDE", + "LATITUDE", + "NOMINAL_DEPTH", +} +OBS_VARIABLES = {"instrument_index", "TIME"} +measured_variables = { + "DEPTH", + "CPHL", + "CHLF", + "CHLU", + "DOX", + "DOX1", + "DOX1_2", + "DOX1_3", + "DOX2", + "DOX2_1", + "DOXS", + "DOXY", + "PRES", + "PRES_REL", + "PSAL", + "TEMP", + "TURB", + "PAR", +} +function_stats = ["_min", "_max", "_std", "_count"] for v in measured_variables: OBS_VARIABLES.add(v) for s in function_stats: OBS_VARIABLES.add(v + s) -NO_INWATER_DATA_FILE = 'IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc' +NO_INWATER_DATA_FILE = "IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc" PH100_FILES = [ - 'IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc', - 'IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc', - NO_INWATER_DATA_FILE + "IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc", + "IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc", + NO_INWATER_DATA_FILE, ] SYD100_FILES = [ - 'IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc', + "IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc", ] class TestHourlyTimeseries(BaseTestCase): EXPECTED_OUTPUT_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220428.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220428.nc", ) def test_hourly_aggregator(self): - output_file, bad_files = hourly_aggregator(files_to_aggregate=INPUT_PATHS, - site_code='NRSROT', - qcflags=(1, 2), - output_dir='/tmp' - ) - self.assertRegex(output_file, - r'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-\d{8}\.nc' - ) + output_file, bad_files = hourly_aggregator( + files_to_aggregate=INPUT_PATHS, + site_code="NRSROT", + qcflags=(1, 2), + output_dir="/tmp", + ) + self.assertRegex( + output_file, + r"IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-\d{8}\.nc", + ) self.assertEqual(1, len(bad_files)) for path, errors in bad_files.items(): self.assertEqual(os.path.join(TEST_ROOT, BAD_FILE), path) - self.assertSetEqual(set(errors), {'no NOMINAL_DEPTH', - 'Wrong file version: Level 0 - Raw Data', - 'no time_deployment_start attribute', - 'no time_deployment_end attribute' - } - ) + self.assertSetEqual( + set(errors), + { + "no NOMINAL_DEPTH", + "Wrong file version: Level 0 - Raw Data", + "no time_deployment_start attribute", + "no time_deployment_end attribute", + }, + ) dataset = Dataset(output_file) - self.assertSetEqual(set(dataset.dimensions), {'OBSERVATION', 'INSTRUMENT', 'string256'}) - self.assertTrue(set(dataset.variables.keys()).issubset(OBS_VARIABLES | INST_VARIABLES)) 
- - inst_variables = {n for n, v in dataset.variables.items() if v.dimensions[0] == 'INSTRUMENT'} + self.assertSetEqual( + set(dataset.dimensions), {"OBSERVATION", "INSTRUMENT", "string256"} + ) + self.assertTrue( + set(dataset.variables.keys()).issubset(OBS_VARIABLES | INST_VARIABLES) + ) + + inst_variables = { + n for n, v in dataset.variables.items() if v.dimensions[0] == "INSTRUMENT" + } self.assertSetEqual(inst_variables, INST_VARIABLES) - obs_variables = {n for n, v in dataset.variables.items() if v.dimensions == ('OBSERVATION',)} + obs_variables = { + n for n, v in dataset.variables.items() if v.dimensions == ("OBSERVATION",) + } self.assertTrue(obs_variables.issubset(OBS_VARIABLES)) - for f in chartostring(dataset['source_file'][:]): + for f in chartostring(dataset["source_file"][:]): self.assertIn(f, INPUT_PATHS) # check metadata self.assertEqual(__version__, dataset.generating_code_version) self.assertIn(__version__, dataset.lineage) - self.assertIn('hourly_timeseries.py', dataset.lineage) + self.assertIn("hourly_timeseries.py", dataset.lineage) self.assertIn(BAD_FILE, dataset.rejected_files) self.compare_global_attributes(dataset) @@ -95,64 +131,81 @@ def test_hourly_aggregator(self): self.compare_variables(dataset) def test_hourly_aggregator_with_nonqc(self): - output_file, bad_files = hourly_aggregator(files_to_aggregate=INPUT_FILES, - site_code='NRSROT', - qcflags=(0, 1, 2), - input_dir=TEST_ROOT, - output_dir='/tmp', - download_url_prefix='http://test.download.url', - opendap_url_prefix='http://test.opendap.url' - ) - self.assertRegex(output_file, - r'IMOS_ANMN-NRS_BOSTUZ_20181213_NRSROT_FV02_hourly-timeseries-including-non-QC' - r'_END-20190523_C-\d{8}\.nc' - ) + output_file, bad_files = hourly_aggregator( + files_to_aggregate=INPUT_FILES, + site_code="NRSROT", + qcflags=(0, 1, 2), + input_dir=TEST_ROOT, + output_dir="/tmp", + download_url_prefix="http://test.download.url", + opendap_url_prefix="http://test.opendap.url", + ) + self.assertRegex( + output_file, + r"IMOS_ANMN-NRS_BOSTUZ_20181213_NRSROT_FV02_hourly-timeseries-including-non-QC" + r"_END-20190523_C-\d{8}\.nc", + ) dataset = Dataset(output_file) - self.assertEqual(dataset['source_file'].download_url_prefix, 'http://test.download.url') - self.assertEqual(dataset['source_file'].opendap_url_prefix, 'http://test.opendap.url') - for f in chartostring(dataset['source_file'][:]): + self.assertEqual( + dataset["source_file"].download_url_prefix, "http://test.download.url" + ) + self.assertEqual( + dataset["source_file"].opendap_url_prefix, "http://test.opendap.url" + ) + for f in chartostring(dataset["source_file"][:]): self.assertIn(f, INPUT_FILES) def test_with_adcp(self): # Replace the BAD_FILE with an ADCP file - aggregation should work (only takes TEMP from the ADCP) - input_files = INPUT_FILES[:2] + \ - ['IMOS_ANMN-NRS_AETVZ_20180816T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1808-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20180822T053000Z_C-20200623T000000Z.nc'] - output_file, bad_files = hourly_aggregator(files_to_aggregate=input_files, - site_code='NRSROT', - qcflags=(1, 2), - input_dir=TEST_ROOT, - output_dir='/tmp' - ) + input_files = INPUT_FILES[:2] + [ + "IMOS_ANMN-NRS_AETVZ_20180816T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1808-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20180822T053000Z_C-20200623T000000Z.nc" + ] + output_file, bad_files = hourly_aggregator( + files_to_aggregate=input_files, + site_code="NRSROT", + qcflags=(1, 2), + input_dir=TEST_ROOT, + output_dir="/tmp", + ) self.assertEqual(0, len(bad_files)) def 
test_all_rejected(self): - self.assertRaises(NoInputFilesError, hourly_aggregator, [BAD_FILE], 'NRSROT', (1, 2), input_dir=TEST_ROOT) + self.assertRaises( + NoInputFilesError, + hourly_aggregator, + [BAD_FILE], + "NRSROT", + (1, 2), + input_dir=TEST_ROOT, + ) def test_some_files_without_good_data(self): - output_file, bad_files = hourly_aggregator(files_to_aggregate=PH100_FILES, - site_code='PH100', - qcflags=(1, 2), - input_dir=TEST_ROOT, - output_dir='/tmp' - ) + output_file, bad_files = hourly_aggregator( + files_to_aggregate=PH100_FILES, + site_code="PH100", + qcflags=(1, 2), + input_dir=TEST_ROOT, + output_dir="/tmp", + ) self.assertEqual(1, len(bad_files)) for path, errors in bad_files.items(): self.assertEqual(NO_INWATER_DATA_FILE, path) - self.assertIn('no in-water data', errors) + self.assertIn("no in-water data", errors) with Dataset(output_file) as dataset: self.check_nan_values(dataset) def test_bad_timestamps(self): - output_file, bad_files = hourly_aggregator(files_to_aggregate=SYD100_FILES, - site_code='SYD100', - qcflags=(1, 2), - input_dir=TEST_ROOT, - output_dir='/tmp' - ) + output_file, bad_files = hourly_aggregator( + files_to_aggregate=SYD100_FILES, + site_code="SYD100", + qcflags=(1, 2), + input_dir=TEST_ROOT, + output_dir="/tmp", + ) self.assertEqual(0, len(bad_files)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py b/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py index 856e17f..35ad5c8 100644 --- a/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py +++ b/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py @@ -7,31 +7,47 @@ from aodntools import __version__ from aodntools.timeseries_products.common import NoInputFilesError -from aodntools.timeseries_products.velocity_aggregated_timeseries import velocity_aggregated +from aodntools.timeseries_products.velocity_aggregated_timeseries import ( + velocity_aggregated, +) from test_aodntools.base_test import BaseTestCase TEST_ROOT = os.path.dirname(__file__) -BAD_FILE = 'IMOS_ANMN-NRS_BAD_VELOCITY_FILE.nc' +BAD_FILE = "IMOS_ANMN-NRS_BAD_VELOCITY_FILE.nc" INPUT_FILES = [ - 'IMOS_ANMN-NRS_AETVZ_20181213T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1812-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20181215T100000Z_C-20200430T000000Z.nc', - 'IMOS_ANMN-NRS_AETVZ_20180816T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1808-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20180822T053000Z_C-20200623T000000Z.nc', - 'IMOS_ANMN-NRS_AETVZ_20191016T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1910-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20191018T100000Z_C-20200430T000000Z.nc', - BAD_FILE + "IMOS_ANMN-NRS_AETVZ_20181213T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1812-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20181215T100000Z_C-20200430T000000Z.nc", + "IMOS_ANMN-NRS_AETVZ_20180816T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1808-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20180822T053000Z_C-20200623T000000Z.nc", + "IMOS_ANMN-NRS_AETVZ_20191016T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1910-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20191018T100000Z_C-20200430T000000Z.nc", + BAD_FILE, ] -OBS_VARS = {'TIME', 'DEPTH', 'DEPTH_quality_control', 'UCUR', 'UCUR_quality_control', - 'VCUR', 'VCUR_quality_control', 'WCUR', 'WCUR_quality_control', 'instrument_index', 'CELL_INDEX'} -INST_VARS = {'LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH', 'SECONDS_TO_MIDDLE'} -STR_VARS = {'source_file', 'instrument_id'} +OBS_VARS = { + "TIME", 
+ "DEPTH", + "DEPTH_quality_control", + "UCUR", + "UCUR_quality_control", + "VCUR", + "VCUR_quality_control", + "WCUR", + "WCUR_quality_control", + "instrument_index", + "CELL_INDEX", +} +INST_VARS = {"LATITUDE", "LONGITUDE", "NOMINAL_DEPTH", "SECONDS_TO_MIDDLE"} +STR_VARS = {"source_file", "instrument_id"} class TestVelocityAggregatedTimeseries(BaseTestCase): EXPECTED_OUTPUT_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV01_velocity-aggregated-timeseries_END-20191018_C-20200623.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV01_velocity-aggregated-timeseries_END-20191018_C-20200623.nc", ) def test_velocity_aggregated(self): - output_file, bad_files = velocity_aggregated(INPUT_FILES, 'NRSROT', input_dir=TEST_ROOT, output_dir='/tmp') + output_file, bad_files = velocity_aggregated( + INPUT_FILES, "NRSROT", input_dir=TEST_ROOT, output_dir="/tmp" + ) self.assertEqual(4, len(INPUT_FILES)) self.assertEqual(1, len(bad_files)) @@ -41,17 +57,23 @@ def test_velocity_aggregated(self): dataset = Dataset(output_file) # check dimensions and variables - self.assertSetEqual(set(dataset.dimensions), {'OBSERVATION', 'INSTRUMENT', 'strlen'}) - self.assertSetEqual(set(dataset.variables.keys()), OBS_VARS | INST_VARS | STR_VARS) + self.assertSetEqual( + set(dataset.dimensions), {"OBSERVATION", "INSTRUMENT", "strlen"} + ) + self.assertSetEqual( + set(dataset.variables.keys()), OBS_VARS | INST_VARS | STR_VARS + ) for var in OBS_VARS: - self.assertEqual(dataset.variables[var].dimensions, ('OBSERVATION',)) + self.assertEqual(dataset.variables[var].dimensions, ("OBSERVATION",)) for var in INST_VARS: - self.assertEqual(dataset.variables[var].dimensions, ('INSTRUMENT',)) + self.assertEqual(dataset.variables[var].dimensions, ("INSTRUMENT",)) for var in STR_VARS: - self.assertEqual(dataset.variables[var].dimensions, ('INSTRUMENT', 'strlen')) + self.assertEqual( + dataset.variables[var].dimensions, ("INSTRUMENT", "strlen") + ) - for f in chartostring(dataset['source_file'][:]): + for f in chartostring(dataset["source_file"][:]): self.assertIn(f, INPUT_FILES) # check attributes @@ -65,9 +87,15 @@ def test_velocity_aggregated(self): self.compare_variables(dataset) def test_all_rejected(self): - self.assertRaises(NoInputFilesError, velocity_aggregated, [BAD_FILE], 'NRSROT', - input_dir=TEST_ROOT, output_dir='/tmp') + self.assertRaises( + NoInputFilesError, + velocity_aggregated, + [BAD_FILE], + "NRSROT", + input_dir=TEST_ROOT, + output_dir="/tmp", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test_aodntools/timeseries_products/test_velocity_hourly_timeseries.py b/test_aodntools/timeseries_products/test_velocity_hourly_timeseries.py index d527f93..5181a00 100644 --- a/test_aodntools/timeseries_products/test_velocity_hourly_timeseries.py +++ b/test_aodntools/timeseries_products/test_velocity_hourly_timeseries.py @@ -8,35 +8,39 @@ from aodntools import __version__ from aodntools.timeseries_products.common import NoInputFilesError -from aodntools.timeseries_products.velocity_hourly_timeseries import velocity_hourly_aggregated +from aodntools.timeseries_products.velocity_hourly_timeseries import ( + velocity_hourly_aggregated, +) from test_aodntools.base_test import BaseTestCase TEST_ROOT = os.path.dirname(__file__) -BAD_FILE = 'IMOS_ANMN-NRS_BAD_VELOCITY_FILE.nc' +BAD_FILE = "IMOS_ANMN-NRS_BAD_VELOCITY_FILE.nc" INPUT_FILES = [ - 
'IMOS_ANMN-NRS_AETVZ_20181213T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1812-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20181215T100000Z_C-20200430T000000Z.nc', - 'IMOS_ANMN-NRS_AETVZ_20180816T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1808-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20180822T053000Z_C-20200623T000000Z.nc', - 'IMOS_ANMN-NRS_AETVZ_20191016T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1910-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20191018T100000Z_C-20200430T000000Z.nc', - BAD_FILE + "IMOS_ANMN-NRS_AETVZ_20181213T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1812-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20181215T100000Z_C-20200430T000000Z.nc", + "IMOS_ANMN-NRS_AETVZ_20180816T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1808-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20180822T053000Z_C-20200623T000000Z.nc", + "IMOS_ANMN-NRS_AETVZ_20191016T080000Z_NRSROT-ADCP_FV01_NRSROT-ADCP-1910-Sentinel-or-Monitor-Workhorse-ADCP-44_END-20191018T100000Z_C-20200430T000000Z.nc", + BAD_FILE, ] -OBS_VARS = {'TIME', 'instrument_index', 'CELL_INDEX'} -INST_VARS = {'LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH', 'SECONDS_TO_MIDDLE'} -STR_VARS = {'source_file', 'instrument_id'} -for v in ['DEPTH', 'UCUR', 'VCUR', 'WCUR']: +OBS_VARS = {"TIME", "instrument_index", "CELL_INDEX"} +INST_VARS = {"LATITUDE", "LONGITUDE", "NOMINAL_DEPTH", "SECONDS_TO_MIDDLE"} +STR_VARS = {"source_file", "instrument_id"} +for v in ["DEPTH", "UCUR", "VCUR", "WCUR"]: OBS_VARS.add(v) - for s in ['_min', '_max', '_std', '_count']: + for s in ["_min", "_max", "_std", "_count"]: OBS_VARS.add(v + s) class TestVelocityHourlyTimeseries(BaseTestCase): EXPECTED_OUTPUT_FILE = os.path.join( - TEST_ROOT, 'IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV02_velocity-hourly-timeseries_END-20191018_C-20220608.nc' + TEST_ROOT, + "IMOS_ANMN-NRS_VZ_20180816_NRSROT_FV02_velocity-hourly-timeseries_END-20191018_C-20220608.nc", ) def test_velocity_hourly(self): - output_file, bad_files = velocity_hourly_aggregated(INPUT_FILES, 'NRSROT', - input_dir=TEST_ROOT, output_dir='/tmp') + output_file, bad_files = velocity_hourly_aggregated( + INPUT_FILES, "NRSROT", input_dir=TEST_ROOT, output_dir="/tmp" + ) self.assertEqual(4, len(INPUT_FILES)) self.assertEqual(1, len(bad_files)) for file, errors in bad_files.items(): @@ -45,17 +49,23 @@ def test_velocity_hourly(self): dataset = Dataset(output_file) # check dimensions and variables - self.assertSetEqual(set(dataset.dimensions), {'OBSERVATION', 'INSTRUMENT', 'strlen'}) - self.assertSetEqual(set(dataset.variables.keys()), OBS_VARS | INST_VARS | STR_VARS) + self.assertSetEqual( + set(dataset.dimensions), {"OBSERVATION", "INSTRUMENT", "strlen"} + ) + self.assertSetEqual( + set(dataset.variables.keys()), OBS_VARS | INST_VARS | STR_VARS + ) for var in OBS_VARS: - self.assertEqual(dataset.variables[var].dimensions, ('OBSERVATION',)) + self.assertEqual(dataset.variables[var].dimensions, ("OBSERVATION",)) for var in INST_VARS: - self.assertEqual(dataset.variables[var].dimensions, ('INSTRUMENT',)) + self.assertEqual(dataset.variables[var].dimensions, ("INSTRUMENT",)) for var in STR_VARS: - self.assertEqual(dataset.variables[var].dimensions, ('INSTRUMENT', 'strlen')) + self.assertEqual( + dataset.variables[var].dimensions, ("INSTRUMENT", "strlen") + ) - for f in chartostring(dataset['source_file'][:]): + for f in chartostring(dataset["source_file"][:]): self.assertIn(f, INPUT_FILES) # check attributes @@ -69,19 +79,26 @@ def test_velocity_hourly(self): self.compare_variables(dataset) def test_all_rejected(self): - self.assertRaises(NoInputFilesError, 
velocity_hourly_aggregated, [BAD_FILE], 'NRSROT', - input_dir=TEST_ROOT, output_dir='/tmp') + self.assertRaises( + NoInputFilesError, + velocity_hourly_aggregated, + [BAD_FILE], + "NRSROT", + input_dir=TEST_ROOT, + output_dir="/tmp", + ) def test_size1_dimensions(self): input_files = [ - 'IMOS_ANMN-NRS_ADCP_LAT_LON_DIMS.nc', - 'IMOS_ANMN-NRS_ADCP_SINGLE_TIMESTAMP.nc' + "IMOS_ANMN-NRS_ADCP_LAT_LON_DIMS.nc", + "IMOS_ANMN-NRS_ADCP_SINGLE_TIMESTAMP.nc", ] - output_file, bad_files = velocity_hourly_aggregated(input_files, 'NRSROT', - input_dir=TEST_ROOT, output_dir='/tmp') + output_file, bad_files = velocity_hourly_aggregated( + input_files, "NRSROT", input_dir=TEST_ROOT, output_dir="/tmp" + ) self.assertEqual(0, len(bad_files)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()
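
A note on reproducing the formatting check: Black's output can differ between releases, so the files reformatted in this patch will only verify cleanly against a compatible Black release, and the exact version used is not recorded in the diff. Below is a minimal local sketch, assuming a 2022-era (22.x) release and limiting the run to the two packages touched by these hunks; the version pin is an assumption, not taken from the patch.

    # Assumed version pin -- substitute whichever Black release the project standardises on.
    pip install "black==22.*"
    # --check only reports files that would be reformatted; --diff prints the changes it would make.
    black --check --diff aodntools test_aodntools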