Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create hourly product in moorings products pipeline #206

Merged
merged 11 commits into from
Feb 24, 2020
39 changes: 35 additions & 4 deletions aodndata/moorings/products_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import re

from owslib.fes import PropertyIsEqualTo, PropertyIsNotEqualTo, And
from owslib.fes import PropertyIsEqualTo, PropertyIsNotEqualTo, And, Or

from aodncore.pipeline import HandlerBase, PipelineFilePublishType, PipelineFile, FileType
from aodncore.pipeline.exceptions import (InvalidFileContentError, InvalidFileNameError, InvalidFileFormatError,
Expand All @@ -11,6 +11,7 @@
from aodncore.util.wfs import ogc_filter_to_string

from aodntools.timeseries_products.aggregated_timeseries import main_aggregator
from aodntools.timeseries_products.hourly_timeseries import hourly_aggregator

from aodndata.moorings.classifiers import MooringsFileClassifier

Expand Down Expand Up @@ -111,6 +112,7 @@ def __init__(self, *args, **kwargs):
self.input_file_collection = None
self.input_file_variables = None
self.excluded_files = dict()
self.product_qc_flags = [[1, 2], [0, 1, 2]]

mhidas marked this conversation as resolved.
Show resolved Hide resolved
def _read_manifest(self):
"""Read the manifest file and extract key parameters for product"""
Expand Down Expand Up @@ -180,10 +182,13 @@ def _get_input_files(self):
# TODO: Replace temp_dir above with cache_dir?

def _get_old_product_files(self):
"""Get a list of the currently published aggregated_timeseries files for the site being processed."""
"""Get a list of the currently published product files for the site being processed."""

filter_list = [PropertyIsEqualTo(propertyname='site_code', literal=self.product_site_code),
mhidas marked this conversation as resolved.
Show resolved Hide resolved
PropertyIsEqualTo(propertyname='data_category', literal='aggregated_timeseries')
Or([PropertyIsEqualTo(propertyname='data_category', literal='aggregated_timeseries'),
PropertyIsEqualTo(propertyname='data_category', literal='hourly_timeseries'),
PropertyIsEqualTo(propertyname='data_category', literal='gridded_timeseries')
])
]
wfs_features = self.get_wfs_features(filter_list, propertyname=['url'])

Expand All @@ -202,7 +207,7 @@ def _get_old_product_files(self):
)

def _make_aggregated_timeseries(self):
"""For each variable, generate product and add to file_collection."""
"""For each variable, generate aggregated timeseries product and add to file_collection."""

for var in self.product_variables:
# Filter input_list to the files relevant for this var
Expand Down Expand Up @@ -232,6 +237,31 @@ def _make_aggregated_timeseries(self):

self._cleanup_previous_version(product_file.name)

ocehugo marked this conversation as resolved.
Show resolved Hide resolved
def _make_hourly_timeseries(self):
"""Generate hourly products for the site and add to file_collection."""

# Hourly products are built from every input file for the site (no per-variable filtering)
input_list = [f.local_path for f in self.input_file_collection]
self.logger.info("Creating hourly products from {n} input files".format(n=len(input_list)))

for qc_flags in self.product_qc_flags:

product_url, errors = hourly_aggregator(input_list, self.product_site_code, qc_flags, self.temp_dir)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume product_url is different for each qc_flags argument?


if errors:
self.logger.warning("{n} files were excluded from the aggregation.".format(n=len(errors)))
for f, e in errors.items():
if f not in self.excluded_files:
self.excluded_files[f] = set(e)
else:
self.excluded_files[f].update(e)

product_file = PipelineFile(product_url, file_update_callback=self._file_update_callback)
product_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
self.file_collection.add(product_file)

self._cleanup_previous_version(product_file.name)

def _cleanup_previous_version(self, product_name):
"""Delete any previously published version(s) of the given product file.
Ignores cases where the previous version has exactly the same file name, as this will simply be overwritten.
Expand Down Expand Up @@ -265,6 +295,7 @@ def preprocess(self):
# TODO: Run compliance checks and remove non-compliant files from the input list (log them).

self._make_aggregated_timeseries()
self._make_hourly_timeseries()

# TODO: Include the list of excluded files as another table in the notification email (instead of the log)
if self.excluded_files:
Expand Down
23 changes: 18 additions & 5 deletions test_aodndata/moorings/test_mooringsProductsHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from aodncore.pipeline.storage import get_storage_broker
from aodncore.testlib import HandlerTestCase, make_test_file

from aodndata.moorings.products_handler import MooringsProductsHandler, MooringsProductClassifier
from aodndata.moorings.products_handler import MooringsProductsHandler, MooringsProductClassifier, get_product_type

TEST_ROOT = os.path.dirname(__file__)
GOOD_MANIFEST = os.path.join(TEST_ROOT, 'test_product.json_manifest')
Expand Down Expand Up @@ -54,19 +54,32 @@ def test_good_manifest(self, mock_webfeatureservice):
self.assertCountEqual(INPUT_FILE_COLLECTION.get_attribute_list('dest_path'),
handler.input_file_collection.get_attribute_list('dest_path')
)
self.assertEqual(len(handler.file_collection), 5)

self.assertEqual(len(handler.file_collection), 7)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 7 hard-coded? If it's because this class is only executed for a single test, then pass the expected size of file_collection in as an argument.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think test_good_manifest is doing too much - maybe refactor it into smaller functions?


# check new product files
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part can be its own test function.

expected_new_products = {'TEMP-aggregated-timeseries',
'PSAL-aggregated-timeseries',
'CHLF-aggregated-timeseries',
'hourly-timeseries',
'hourly-timeseries-including-non-QC'
mhidas marked this conversation as resolved.
Show resolved Hide resolved
}
published_files = handler.file_collection.filter_by_attribute_id('publish_type',
PipelineFilePublishType.HARVEST_UPLOAD)
self.assertEqual(len(published_files), 3)
self.assertEqual(len(published_files), len(expected_new_products))
for f in published_files:
self.assertTrue(f.is_harvested and f.is_stored)
published_products = {get_product_type(f.name) for f in published_files}
self.assertSetEqual(published_products, expected_new_products)

# check deletion of previous versions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part can also be a separate test function.

expected_deleted_products = {'TEMP-aggregated-timeseries', 'PSAL-aggregated-timeseries'}
deleted_files = handler.file_collection.filter_by_attribute_id('publish_type',
PipelineFilePublishType.DELETE_UNHARVEST)
self.assertEqual(len(deleted_files), 2)
self.assertEqual(len(deleted_files), len(expected_deleted_products))
for f in deleted_files:
self.assertTrue(f.is_harvested and f.is_stored)
deleted_products = {get_product_type(f.name) for f in deleted_files}
self.assertSetEqual(deleted_products, expected_deleted_products)

self.assertEqual(len(handler.excluded_files), 1)

Expand Down