diff --git a/.gitignore b/.gitignore index 11fe633..68e1461 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ logfile *.html *.csv *.png +*.db pecos/tests/*.csv pecos/tests/*.png diff --git a/.travis.yml b/.travis.yml index f3366b1..9371193 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,15 +9,23 @@ sudo: false # if false, use TravisCI's container based build matrix: include: - python: 3.6 + dist: xenial env: CONDA_ENV=py36 services: - xvfb + sudo: true - python: 3.7 dist: xenial services: - xvfb sudo: true env: CONDA_ENV=py37 + - python: 3.8 + dist: xenial + env: CONDA_ENV=py38 + services: + - xvfb + sudo: true addons: apt: diff --git a/LICENSE.txt b/LICENSE.txt index 813c566..4be131a 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1 +1 @@ -License Notice ============== Copyright 2016-2020 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. This software is distributed under the Revised BSD License. Pecos also leverages a variety of third-party software packages, which have separate licensing policies. Revised BSD License ------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Sandia National Laboratories, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +License Notice ============== Copyright 2016 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. This software is distributed under the Revised BSD License. Pecos also leverages a variety of third-party software packages, which have separate licensing policies. Revised BSD License ------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Sandia National Laboratories, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/README.md b/README.md index 52c2c0f..3fad313 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![TravisCI](https://travis-ci.org/sandialabs/pecos.svg?branch=master)](https://travis-ci.org/sandialabs/pecos) [![Coverage Status](https://coveralls.io/repos/github/sandialabs/pecos/badge.svg?branch=master)](https://coveralls.io/github/sandialabs/pecos?branch=master) -[![Documentation Status](https://readthedocs.org/projects/pecos/badge/?version=latest)](http://pecos.readthedocs.org/en/latest/) +[![Documentation Status](https://readthedocs.org/projects/pecos/badge/?version=stable)](https://pecos.readthedocs.io/en/stable/?badge=stable) [![Downloads](https://pepy.tech/badge/pecos)](https://pepy.tech/project/pecos) Advances in sensor technology have rapidly increased our ability to monitor diff --git a/ci/requirements-py36.yml b/ci/requirements-py36.yml index aaf62fd..595faa2 100644 --- a/ci/requirements-py36.yml +++ b/ci/requirements-py36.yml @@ -4,14 +4,15 @@ channels: - http://conda.anaconda.org/pvlib dependencies: - python=3.6 - - numpy - pandas + - numpy + - jinja2 + - matplotlib - nose - pvlib - - matplotlib - plotly - ephem - - jinja2 - xlrd + - sqlalchemy - pip: - coveralls \ No 
newline at end of file diff --git a/ci/requirements-py37.yml b/ci/requirements-py37.yml index d1d8e29..34d804a 100644 --- a/ci/requirements-py37.yml +++ b/ci/requirements-py37.yml @@ -4,14 +4,15 @@ channels: - http://conda.anaconda.org/pvlib dependencies: - python=3.7 - - numpy - pandas + - numpy + - jinja2 + - matplotlib - nose - pvlib - - matplotlib - plotly - ephem - - jinja2 - xlrd + - sqlalchemy - pip: - coveralls \ No newline at end of file diff --git a/ci/requirements-py38.yml b/ci/requirements-py38.yml new file mode 100644 index 0000000..6859532 --- /dev/null +++ b/ci/requirements-py38.yml @@ -0,0 +1,18 @@ +name: test_env +channels: + - defaults + - http://conda.anaconda.org/pvlib +dependencies: + - python=3.8 + - pandas + - numpy + - jinja2 + - matplotlib + - nose + - pvlib + - plotly + - ephem + - xlrd + - sqlalchemy + - pip: + - coveralls \ No newline at end of file diff --git a/documentation/applications.rst b/documentation/applications.rst index a120ef1..fb6fb7e 100644 --- a/documentation/applications.rst +++ b/documentation/applications.rst @@ -1,40 +1,37 @@ Custom applications ==================== -Pecos can be customized for specific applications. Python scripts can be added -to initialize data and add application specific models. Additional quality control tests -can be added by inheriting from the PerformanceMonitoring class. +While Pecos was initially developed to monitor photovoltaic systems, it is designed to be used for a wide range of applications. The ability to run the analysis within the Python environment enables the use of diverse analysis options that can be incorporated into Pecos, including application specific models. The software has been used to monitor energy systems in support of several Department of Energy projects, as described below. 
-PV system monitoring +Photovoltaic systems --------------------- -Pecos was originally developed to monitor photovoltaic (PV) systems as part of the + +Pecos was originally developed at Sandia National Laboratories in 2016 to monitor photovoltaic (PV) systems as part of the `Department of Energy Regional Test Centers `_. Pecos is used to run daily analysis on data collected at several sites across the US. For PV systems, the translation dictionary can be used to group data according to the system architecture, which can include multiple strings and modules. The time filter can be defined based on sun position and system location. -The data objects used in Pecos are compatible with PVLIB, which can be used to model PV -systems [SHFH16]_ (http://pvlib-python.readthedocs.io/). +The data objects used in Pecos are compatible with `PVLIB `_, which can be used to model PV +systems [SHFH16]_. Pecos also includes functions to compute PV specific metrics (i.e. insolation, performance ratio, clearness index) in the :class:`~pecos.pv` module. The International Electrotechnical Commission (IEC) has developed guidance to measure and analyze energy production from PV systems. -[KlSC17]_ describes an application of the standards outlined in IEC 61724-3, using +Klise et al. [KlSC17]_ describe an application of IEC 61724-3, using Pecos and PVLIB. Pecos includes a PV system example in the `examples/pv `_ directory. -MRE quality control analysis +Marine renewable energy systems -------------------------------- -Pecos is also being used to perform quality control analysis on data collected from -marine renewable energy (MRE) systems including wave, tidal, and river -systems. -This effort is part of the `Marine and Hydrokinetic Toolkit (MHKiT) `_. - -Performance metrics ---------------------- -Pecos is typically used to run a series of quality control tests on data collected -over a set time interval (i.e. hourly, daily, weekly). 
-The metrics that are generated from each analysis can be used in additional -quality control analysis to track long term performance and system health (i.e. yearly summary reports). -Pecos includes a performance metrics example (based on one year of PV metrics) -in the `examples/metrics `_ directory. + +In partnership with National Renewable Energy Laboratory (NREL) and Pacific Northwest National Laboratory (PNNL), Pecos was integrated into the `Marine and Hydrokinetic Toolkit (MHKiT) `_ to support research funded by the Department of Energy’s Water Power Technologies Office. MHKiT provides the marine renewable energy (MRE) community with tools for data quality control, resource assessment, and device performance which adhere to the International Electrotechnical Commission (IEC) Technical Committee’s standards, IEC TC 114. Pecos provides a quality control analysis on data collected from +MRE systems including wave, tidal, and river systems. + +Fossil energy systems +----------------------- + +In partnership with National Energy Technology Laboratory (NETL), Pecos was extended to demonstrate real-time monitoring of coal-fired power plants in support of the Department of Energy's `Institute for the Design of Advanced Energy Systems (IDAES) `_. +As part of this demonstration, streaming algorithms were added to Pecos to facilitate near real-time analysis using continuous data streams. + + diff --git a/documentation/automation.rst b/documentation/automation.rst new file mode 100644 index 0000000..a294251 --- /dev/null +++ b/documentation/automation.rst @@ -0,0 +1,211 @@ +Automation +============= + +Task scheduler +------------------ + +To run Pecos on an automated schedule, create a task using your operating system. +On Windows, open the Control Panel and search for *Schedule Tasks*. +On Linux and OSX, use the *cron* utility. + +Tasks are defined by a trigger and an action. +The trigger indicates when the task should be run (i.e. Daily at 1:00 pm).
+The action can be set to run a batch file. +A batch file (.bat or .cmd filename extension) can be easily +written to start a Python script which runs Pecos. +For example, the following batch file runs driver.py:: + + cd your_working_directory + C:\Users\username\Anaconda3\python.exe driver.py + +.. _continuous: + +Continuous analysis +------------------------ + +The following example illustrates a framework that analyzes continuous streaming data and provides reports. +For continuous data streams, it is often advantageous to provide quality control analysis and reports at a regular interval. While the analysis and reporting can occur every time new data is available, it is often more informative and more efficient to run analysis and create reports that cover a longer time interval. For example, data might be collected every minute and quality control analysis might be run every day. + +The following example pulls data from an SQL database that includes a table of raw data (data), table of data that has completed quality control analysis (qc_data), and a table that stores a summary of quality control test failures (qc_summary). +After the analysis, quality control results are appended to the database. This process could also include metrics that describe the quality control results. +The following code could be used as a Python driver that runs using a task scheduler every day, pulling in yesterday's data. In this example, 1 hour of cleaned data is used to initialize the moving window and a streaming outlier test is run. + +.. doctest:: + :hide: + + >>> import pandas as pd + >>> from sqlalchemy import create_engine + >>> from sqlalchemy.types import DateTime, Date, Time, Float + >>> import datetime + >>> import numpy as np + >>> import os + + >>> try: os.remove('monitor.db') + ... 
except: pass + + >>> engine = create_engine('sqlite:///monitor.db', echo=False) + >>> date = datetime.date.today()-datetime.timedelta(days=1) + >>> N = 24*60 + >>> index = pd.date_range(date, periods=N, freq='Min') + >>> df1 = {'A': np.random.normal(size=N),'B': np.random.normal(size=N)} + >>> df1 = pd.DataFrame(df1, index=index) + >>> df1.index.name = 'timestamp' + >>> df1.to_sql('data', engine, dtype={'timestamp': DateTime(), 'A': Float(), 'B': Float()}) + + >>> index = pd.date_range(date-datetime.timedelta(days=1), periods=N, freq='Min') + >>> df2 = {'A': np.random.normal(size=N),'B': np.random.normal(size=N)} + >>> df2 = pd.DataFrame(df2, index=index) + >>> df2.index.name = 'timestamp' + >>> df2.to_sql('qc_data', engine, dtype={'timestamp': DateTime(), 'A': Float(), 'B': Float()}) + + >>> #data1 = engine.execute("SELECT * FROM data").fetchall() + >>> #history1 = engine.execute("SELECT * FROM qc_data").fetchall() + +.. doctest:: + + >>> import pandas as pd + >>> from sqlalchemy import create_engine + >>> import datetime + >>> import pecos + + >>> # Create the SQLite engine + >>> engine = create_engine('sqlite:///monitor.db', echo=False) + + >>> # Define the date to extract yesterday's data + >>> date = datetime.date.today()-datetime.timedelta(days=1) + + >>> # Load data and recent history from the database + >>> data = pd.read_sql("SELECT * FROM data WHERE timestamp BETWEEN '" + str(date) + \ + ... " 00:00:00' AND '" + str(date) + " 23:59:59';" , engine, + ... parse_dates='timestamp', index_col='timestamp') + + >>> history = pd.read_sql("SELECT * FROM qc_data WHERE timestamp BETWEEN '" + \ + ... str(date-datetime.timedelta(days=1)) + " 23:00:00' AND '" + \ + ... str(date-datetime.timedelta(days=1)) + " 23:59:59';" , engine, + ... 
parse_dates='timestamp', index_col='timestamp') + + >>> # Setup the PerformanceMonitoring with data and history and run a streaming outlier test + >>> pm = pecos.monitoring.PerformanceMonitoring() + >>> pm.add_dataframe(data) + >>> pm.add_dataframe(history) + >>> pm.check_outlier([-3, 3], window=3600, streaming=True) + + >>> # Save the cleaned data and test results to the database + >>> pm.cleaned_data.to_sql('qc_data', engine, if_exists='append') + >>> pm.test_results.to_sql('qc_summary', engine, if_exists='append') + + >>> # Create a monitoring report with test results and graphics + >>> test_results_graphics = pecos.graphics.plot_test_results(data, pm.test_results) + >>> filename = pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, + ... filename='monitoring_report_'+str(date)+'.html') + +Configuration file +------------------------ + +A configuration file can be used to store information about the system, data, and +quality control tests. **The configuration file is not used directly within Pecos, +therefore there are no specific formatting requirements.** +Configuration files can be useful when using the same Python script +to analyze several systems that have slightly different input requirements. + +The `examples/simple `_ directory includes a configuration file, **simple_config.yml**, that defines +system specifications, +translation dictionary, +composite signals, +corrupt values, +and bounds for range and increment tests. +The script, **simple_example_using_config.py** uses this +configuration file to run the simple example. + +.. literalinclude:: ../examples/simple/simple_config.yml + +For some use cases, it is convenient to use strings of Python code in +a configuration file to define time filters, +quality control bounds, and composite signals. +These strings can be evaluated using :class:`~pecos.utils.evaluate_string`. +**WARNING this function calls** +``eval`` +**. 
Strings of Python code should be thoroughly tested by the user.** + +For each {keyword} in the string, {keyword} is expanded in the following order: + +* If keyword is ELAPSED_TIME, CLOCK_TIME or EPOCH_TIME then data.index is + converted to seconds (elapsed time, clock time, or epoch time) is used in the evaluation +* If keyword is used to select a column (or columns) of data, then data[keyword] + is used in the evaluation +* If a translation dictionary is used to select a column (or columns) of data, then + data[trans[keyword]] is used in the evaluation +* If the keyword is a key in a dictionary of constants (specs), then + specs[keyword] is used in the evaluation + +For example, the time filter string is evaluated below. + +.. doctest:: + :hide: + + >>> import pandas as pd + >>> import numpy as np + >>> import pecos + >>> index = pd.date_range('1/1/2015', periods=96, freq='15Min') + >>> data = {'A': np.random.rand(96), 'B': np.random.rand(96)} + >>> df = pd.DataFrame(data, index=index) + +.. doctest:: + + >>> string_to_eval = "({CLOCK_TIME} > 3*3600) & ({CLOCK_TIME} < 21*3600)" + >>> time_filter = pecos.utils.evaluate_string(string_to_eval, df) + +.. _devicetoclient_config: + +Data acquisition +-------------------- + +Pecos includes basic data acquisition methods to transfer data from sensors to an SQL database. +These methods require the Python packages +sqlalchemy (https://www.sqlalchemy.org/) and +minimalmodbus (https://minimalmodbus.readthedocs.io). + +The :class:`~pecos.io.device_to_client` method collects data from a modbus device and stores it in a local +MySQL database. +The method requires several configuration options, which are stored as a nested dictionary. +pyyaml can be used to store configuration options in a file. +The options are stored in a **Client** block and a **Devices** block. +The Devices block can define multiple devices and each device can have multiple data streams. +The configuration options are described below. 
+ +* **Client**: A dictionary that contains information about the client. + The dictionary has the following keys: + + * **IP**: IP address (string) + * **Database**: name of database (string) + * **Table**: name of table (string) + * **Username**: name of user (string) + * **Password**: password for user (string) + * **Interval**: data collection frequency in seconds (integer) + * **Retries**: number of retries for each channel (integer) + +* **Devices**: A list of dictionaries that contain information about each device (one dictionary per device). + Each dictionary has the following keys: + + * **Name**: modbus device name (string) + * **USB**: serial connection (string) e.g. /dev/ttyUSB0 for linux + * **Address**: modbus slave address (string) + * **Baud**: data transfer rate in bits per second (integer) + * **Parity**: parity of transmitted data for error checking (string). Possible values: N, E, O + * **Bytes**: number of data bits (integer) + * **Stopbits**: number of stop bits (integer) + * **Timeout**: read timeout value in seconds (integer) + * **Data**: A list of dictionaries that contain information about each data stream (one dictionary per data stream). + Each dictionary has the following keys: + + * **Name**: data name (string) + * **Type**: data type (string) + * **Scale**: scaling factor (integer) + * **Conversion**: conversion factor (float) + * **Channel**: register number (integer) + * **Signed**: define data as unsigned or signed (bool) + * **Fcode**: modbus function code (integer). Possible values: 3,4 + +Example configuration options are shown below. + +.. literalinclude:: ../pecos/templates/device_to_client.yml diff --git a/documentation/composite_signal.rst b/documentation/composite_signal.rst index 24bc680..d2dbfff 100644 --- a/documentation/composite_signal.rst +++ b/documentation/composite_signal.rst @@ -43,7 +43,7 @@ the PerformanceMonitoring object. .. 
doctest:: - >>> clocktime = pecos.utils.datetime_to_clocktime(pm.df.index) + >>> clocktime = pecos.utils.datetime_to_clocktime(pm.data.index) >>> wave_model = pd.DataFrame(np.sin(10*(clocktime/86400)), - ... index=pm.df.index, columns=['Wave Model']) + ... index=pm.data.index, columns=['Wave Model']) >>> pm.add_dataframe(wave_model) diff --git a/documentation/conf.py b/documentation/conf.py index d79b3f4..2b9a8ef 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -67,7 +67,7 @@ # General information about the project. project = u'Pecos' -copyright = u'2016-2020, National Technology & Engineering Solutions of Sandia, LLC (NTESS)' +copyright = u'2016, National Technology & Engineering Solutions of Sandia, LLC (NTESS)' author = u'Sandia National Laboratories' # The version info for the project you're documenting, acts as replacement for diff --git a/documentation/configfile.rst b/documentation/configfile.rst deleted file mode 100644 index bd34de1..0000000 --- a/documentation/configfile.rst +++ /dev/null @@ -1,54 +0,0 @@ -Configuration file -================== - -A configuration file can be used to store information about the system, data, and -quality control tests. **The configuration file is not used directly within Pecos, -therefore there are no specific formatting requirements.** -Configuration files can be useful when using the same Python script -to analyze several systems that have slightly different input requirements. - -The `examples/simple `_ directory includes a configuration file, **simple_config.yml**, that defines -system specifications, -translation dictionary, -composite signals, -corrupt values, -and bounds for range and increment tests. -The script, **simple_example_using_config.py** uses this -configuration file to run the simple example. - -.. 
literalinclude:: ../examples/simple/simple_config.yml - -For some use cases, it is convenient to use strings of Python code in -a configuration file to define time filters, -quality control bounds, and composite signals. -These strings can be evaluated using :class:`~pecos.utils.evaluate_string`. -**WARNING this function calls ``eval``. Strings of Python code should be -thoroughly tested by the user.** - -For each {keyword} in the string, {keyword} is expanded in the following order: - -* If keyword is ELAPSED_TIME, CLOCK_TIME or EPOCH_TIME then data.index is - converted to seconds (elapsed time, clock time, or epoch time) is used in the evaluation -* If keyword is used to select a column (or columns) of data, then data[keyword] - is used in the evaluation -* If a translation dictionary is used to select a column (or columns) of data, then - data[trans[keyword]] is used in the evaluation -* If the keyword is a key in a dictionary of constants (specs), then - specs[keyword] is used in the evaluation - -For example, the time filter string is evaluated below. - -.. doctest:: - :hide: - - >>> import pandas as pd - >>> import numpy as np - >>> import pecos - >>> index = pd.date_range('1/1/2015', periods=96, freq='15Min') - >>> data = {'A': np.random.rand(96), 'B': np.random.rand(96)} - >>> df = pd.DataFrame(data, index=index) - -.. doctest:: - - >>> string_to_eval = "({CLOCK_TIME} > 3*3600) & ({CLOCK_TIME} < 21*3600)" - >>> time_filter = pecos.utils.evaluate_string(string_to_eval, df) diff --git a/documentation/daq.rst b/documentation/daq.rst deleted file mode 100644 index a05f010..0000000 --- a/documentation/daq.rst +++ /dev/null @@ -1,57 +0,0 @@ -Data acquisition -================== - -Pecos includes basic data acquisition methods to transfer data from sensors to an SQL database. -These methods require the Python packages -sqlalchemy (https://www.sqlalchemy.org/) and -minimalmodbus (https://minimalmodbus.readthedocs.io). - -.. 
_devicetoclient_config: - -Device to client -------------------- - -The :class:`~pecos.io.device_to_client` method collects data from a modbus device and stores it in a local -MySQL database. -The method requires several configuration options, which are stored as a nested dictionary. -pyyaml can be used to store configuration options in a file. -The options are stored in a **Client** block and a **Devices** block. -The Devices block can define multiple devices and each device can have multiple data streams. -The configuration options are described below. - -* **Client**: A dictionary that contains information about the client. - The dictionary has the following keys: - - * **IP**: IP address (string) - * **Database**: name of database (string) - * **Table**: name of table (string) - * **Username**: name of user (string) - * **Password**: password for user (string) - * **Interval**: data collection frequency in seconds (integer) - * **Retries**: number of retries for each channel (integer) - -* **Devices**: A list of dictionaries that contain information about each device (one dictionary per device). - Each dictionary has the following keys: - - * **Name**: modbus device name (string) - * **USB**: serial connection (string) e.g. /dev/ttyUSB0 for linux - * **Address**: modbus slave address (string) - * **Baud**: data transfer rate in bits per second (integer) - * **Parity**: parity of transmitted data for error checking (string). Possible values: N, E, O - * **Bytes**: number of data bits (integer) - * **Stopbits**: number of stop bits (integer) - * **Timeout**: read timeout value in seconds (integer) - * **Data**: A list of dictionaries that contain information about each data stream (one dictionary per data stream). 
- Each dictionary has the following keys: - - * **Name**: data name (string) - * **Type**: data type (string) - * **Scale**: scaling factor (integer) - * **Conversion**: conversion factor (float) - * **Channel**: register number (integer) - * **Signed**: define data as unsigned or signed (bool) - * **Fcode**: modbus function code (integer). Possible values: 3,4 - -Example configuration options are shown below. - -.. literalinclude:: ../pecos/templates/device_to_client.yml diff --git a/documentation/figures/monitoring_report.png b/documentation/figures/monitoring_report.png index 230ea02..c687688 100644 Binary files a/documentation/figures/monitoring_report.png and b/documentation/figures/monitoring_report.png differ diff --git a/documentation/framework.rst b/documentation/framework.rst index ca020cf..ca2d060 100644 --- a/documentation/framework.rst +++ b/documentation/framework.rst @@ -12,7 +12,7 @@ Pecos contains the following modules ======================================= ============================================================================================================================================= :class:`~pecos.monitoring` Contains the PerformanceMonitoring class and individual quality control test functions that are used to run analysis :class:`~pecos.metrics` Contains metrics that describe the quality control analysis or compute quantities that might be of use in the analysis - :class:`~pecos.io` Contains functions to load data, send email alerts, write results to files, and generate html reports + :class:`~pecos.io` Contains functions to load data, send email alerts, write results to files, and generate HTML and LaTeX reports :class:`~pecos.graphics` Contains functions to generate scatter, time series, and heatmap plots for reports :class:`~pecos.utils` Contains helper functions, including functions to convert time series indices from seconds to datetime ======================================= 
============================================================================================================================================= @@ -20,9 +20,12 @@ Pecos contains the following modules In addition to the modules listed above, Pecos also includes a :class:`~pecos.pv` module that contains metrics specific to photovoltaic analysis. -Object-oriented approach ------------------------- +Object-oriented and functional approach +----------------------------------------- +Pecos supports quality control tests that are called using both an object-oriented and functional approach. +Object-oriented approach +^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pecos includes a :class:`~pecos.monitoring.PerformanceMonitoring` class which is the base class used to define the quality control analysis. This class stores: @@ -30,16 +33,18 @@ the quality control analysis. This class stores: * Translation dictionary (maps raw data column names to common names) * Time filter (excludes specific timestamps from analysis) -The class can be used to call quality control tests, including: +The class is used to call quality control tests, including: -* Check timestamps for missing, duplicate, and non-monotonic indexes -* Check for missing data -* Check for corupt data -* Check for data outside expected range -* Check for stagnant of abrupt changes in the data -* Check for outliers +* :class:`~pecos.monitoring.PerformanceMonitoring.check_timestamp`: Check timestamps for missing, duplicate, and non-monotonic indexes +* :class:`~pecos.monitoring.PerformanceMonitoring.check_missing`: Check for missing data +* :class:`~pecos.monitoring.PerformanceMonitoring.check_corrupt`: Check for corrupt data +* :class:`~pecos.monitoring.PerformanceMonitoring.check_range`: Check for data outside expected range +* :class:`~pecos.monitoring.PerformanceMonitoring.check_delta`: Check for stagnant or abrupt changes in the data +* :class:`~pecos.monitoring.PerformanceMonitoring.check_outlier`: Check for outliers +*
:class:`~pecos.monitoring.PerformanceMonitoring.check_custom_static`: Custom static quality control test +* :class:`~pecos.monitoring.PerformanceMonitoring.check_custom_streaming`: Custom streaming quality control test -The class can be used to return: +The class can return the following results: * Cleaned data (data that failed a test is replaced by NaN) * Boolean mask (indicates if data failed a test) @@ -78,7 +83,7 @@ These properties are updated each time a quality control test is run. >>> test_results = pm.test_results Functional approach --------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^ The same quality control tests can also be run using individual functions. These functions generate a PerformanceMonitoring object under the hood and return: @@ -104,4 +109,54 @@ from a resulting dictionary. >>> mask = results['mask'] >>> test_results = results['test_results'] -Note, examples in the documentation use the object-oriented approach. \ No newline at end of file +Note, examples in the documentation use the object-oriented approach. + +.. _static_streaming: + +Static and streaming analysis +------------------------------------ +Pecos supports both static and streaming analysis. + +Static analysis +^^^^^^^^^^^^^^^^^^^^^^^ +Most quality control tests in Pecos use static analysis. +Static analysis operates on the entire data set to determine if all data points are normal or anomalous. While this can include operations like moving window statistics, the quality control tests operate on the entire data set at once. +This means that results from the quality control test are not dependent on results from a previous time step. +This approach is appropriate when data at different time steps can be analyzed independently, or moving window statistics used to analyze the data do not need to be updated based on test results.
+ +The following quality control tests use static analysis: + +* :class:`~pecos.monitoring.PerformanceMonitoring.check_missing` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_corrupt` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_range` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_delta` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_outlier` :superscript:`1` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_custom_static` + +:superscript:`1` The outlier test can make use of both static and streaming analysis. See :ref:`outlier` for more details. + +Streaming analysis +^^^^^^^^^^^^^^^^^^^^^^^ +The streaming analysis loops through each data point using a quality control test that relies on information from "clean data" in a moving window. If a data point is determined to be anomalous, it is not included in the window for subsequent analysis. +When using a streaming analysis, Pecos keeps track of the cleaned history that is used in the quality control test at each time step. +This approach is important to use when the underlying methods in the quality control test could be corrupted by historical data points that were deemed anomalous. The streaming analysis also allows users to better analyze continuous datasets in a near real-time fashion. While Pecos could be used to analyze data at a single time step in a real-time fashion (creating a new instance of the PerformanceMonitoring class each time), the methods in Pecos are really designed to analyze data over a time period. That time period can depend on several factors, including the size of the data and how often the test results and reports should be generated. Cleaned history can be appended to new datasets as they become available to create a seamless analysis for continuous data. See :ref:`continuous` for more details. 
+ +The streaming analysis includes an optional parameter which is used to **rebase data in the history window** if a certain fraction of that data has been deemed to be anomalous. The ability to rebase the history is useful if data changes to a new normal condition that would otherwise continue to be flagged as anomalous. + +The following quality control tests use streaming analysis: + +* :class:`~pecos.monitoring.PerformanceMonitoring.check_timestamp` :superscript:`2` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_outlier` :superscript:`3` +* :class:`~pecos.monitoring.PerformanceMonitoring.check_custom_streaming` + +:superscript:`2` The timestamp test does not loop through data using a moving window, rather timestamp functionality in Pandas is used to determine anomalies in the time index. + +:superscript:`3` The outlier test can make use of both static and streaming analysis. See :ref:`outlier` for more details. + +Custom quality control tests +--------------------------------- +Pecos supports custom quality control tests that can be static or streaming in form. This feature allows the user to customize the analysis used to determine if data is anomalous and return custom metadata from the analysis. + +The custom function is defined outside of Pecos and handed to the custom quality control method as an input argument. This allows the user to include analysis options that are not currently supported in Pecos or are very specific to their application. + +While there are no specifications on the information that metadata stores, the metadata commonly includes raw values that were used in the quality control test. For example, while the outlier test returns a boolean value that indicates if data is normal or anomalous, the metadata can include the normalized data value that was used to make that determination. See :ref:`custom` for more details. 
diff --git a/documentation/index.rst b/documentation/index.rst index 343116e..fc39879 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -50,11 +50,9 @@ Contents qc_tests metrics composite_signal - configfile - scheduler results + automation applications - daq license whatsnew developers diff --git a/documentation/index_latex.rst b/documentation/index_latex.rst deleted file mode 100644 index ce5ea7f..0000000 --- a/documentation/index_latex.rst +++ /dev/null @@ -1,32 +0,0 @@ -Performance Monitoring using Pecos -================================================================ - -.. toctree:: - :maxdepth: 1 - - acknowledgements - overview - installation - example - timeseries - translation - timefilter - qc_tests - metrics - composite_signal - configfile - scheduler - results - applications - daq - license - reference - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - - diff --git a/documentation/installation.rst b/documentation/installation.rst index e8ad3e2..18e8493 100644 --- a/documentation/installation.rst +++ b/documentation/installation.rst @@ -1,7 +1,7 @@ Installation ====================================== -Pecos requires Python (tested on 3.6 and 3.7) along with several Python +Pecos requires Python (tested on 3.6, 3.7, and 3.8) along with several Python package dependencies. Information on installing and using Python can be found at https://www.python.org/. Python distributions, such as Anaconda, are recommended to manage the Python interface. 
@@ -28,7 +28,7 @@ The software can then be installed by unzipping the file and running setup.py:: unzip pecos-master.zip cd pecos-master python setup.py install - + Required Python package dependencies include: * Pandas [Mcki13]_: used to analyze and store time series data, diff --git a/documentation/license.rst b/documentation/license.rst index 1e140b8..59da5c7 100644 --- a/documentation/license.rst +++ b/documentation/license.rst @@ -10,7 +10,7 @@ Copyright ------------ .. code-block:: none - Copyright 2016-2020 National Technology & Engineering Solutions of Sandia, + Copyright 2016 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. diff --git a/documentation/metrics.rst b/documentation/metrics.rst index 81ac6b5..94ff732 100644 --- a/documentation/metrics.rst +++ b/documentation/metrics.rst @@ -6,6 +6,10 @@ compute quantities that might be of use in the analysis. Many of these metrics aggregates over time and can be saved to track long term performance and system health. +While Pecos typically runs a series of quality control tests on raw data, quality control tests can also be run on metrics generated from these analyses to track long term performance and system health. For example, daily quality control analysis can generate summary metrics that can later be used to generate a yearly summary report. +Pecos includes a performance metrics example (based on one year of PV metrics) +in the `examples/metrics `_ directory. 
+ Quality control index ------------------------- The quality control index (QCI) is a general metric which indicates the diff --git a/documentation/overview.rst b/documentation/overview.rst index bd9b5c3..fe6c6f0 100644 --- a/documentation/overview.rst +++ b/documentation/overview.rst @@ -9,16 +9,18 @@ and generate reports which include performance metrics, test results, and graphi The software can be customized for specific applications. Some high-level features include: -* Pecos uses Pandas DataFrames [Mcki13]_ for time series analysis. This dependency +* Pecos uses Pandas DataFrames [Mcki13]_ to store and analyze time series data. This dependency facilitates a wide range of analysis options and date-time functionality. -* Data columns can be easily reassigned to common names through the use of a +* Data column names can be easily reassigned to common names through the use of a translation dictionary. Translation dictionaries also allow data columns to be grouped for analysis. * Time filters can be used to eliminate data at specific times from quality control tests (i.e. early evening and late afternoon). +* Predefined and custom quality control functions can be used to determine if data is anomalous. + * Application specific models can be incorporated into quality control tests to compare measured to modeled data values. @@ -28,6 +30,6 @@ Some high-level features include: * Analysis can be set up to run on an automated schedule (i.e. Pecos can be run each day to analyze data collected on the previous day). -* HTML formatted reports can be sent via email or hosted on a website. +* HTML formatted reports can be sent via email or hosted on a website. LaTeX formatted reports can also be generated. * Data acquisition methods can be used to transfer data from sensors to an SQL database. 
\ No newline at end of file diff --git a/documentation/qc_tests.rst b/documentation/qc_tests.rst index 49dd40f..fd29903 100644 --- a/documentation/qc_tests.rst +++ b/documentation/qc_tests.rst @@ -1,10 +1,12 @@ +.. _quality_control: + Quality control tests ====================== Pecos includes several built in quality control tests. When a test fails, information is stored in a summary table. This information can be saved to a file, database, or included in reports. -Quality controls tests fall into seven categories: +Quality controls tests fall into eight categories: * Timestamp * Missing data @@ -13,6 +15,7 @@ Quality controls tests fall into seven categories: * Delta * Increment * Outlier +* Custom .. note:: Quality control tests can also be called using individual functions, see :ref:`software_framework` for more details. @@ -24,7 +27,7 @@ duplicate, and non-monotonic indexes. If a duplicate timestamp is found, Pecos If timestamps are not monotonic, the timestamps are reordered. For this reason, the timestamp should be corrected before other quality control tests are run. 
-**The timestamp test is the only test that modifies the data stored in pm.df.** +**The timestamp test is the only test that modifies the data stored in pm.data.** Input includes: * Expected frequency of the time series in seconds @@ -45,7 +48,7 @@ For example, >>> import pandas as pd >>> import pecos >>> pm = pecos.monitoring.PerformanceMonitoring() - >>> index = pd.date_range('1/1/2016', periods=3, freq='s') + >>> index = pd.date_range('1/1/2016', periods=3, freq='60s') >>> data = [[1,2,3],[4,5,6],[7,8,9]] >>> df = pd.DataFrame(data=data, index=index, columns=['A', 'B', 'C']) >>> pm.add_dataframe(df) @@ -129,9 +132,9 @@ Input includes: * Upper and lower bound -* Data column (default = None, which indicates that all columns are used) +* Size of the moving window used to compute the difference between the minimum and maximum -* Size of the moving window used to compute the difference between the minimum and maximum (default = 3600 seconds) +* Data column (default = None, which indicates that all columns are used) * Flag indicating if the test should only check for positive delta (the min occurs before the max) or negative delta (the max occurs before the min) (default = False) @@ -188,6 +191,8 @@ checks if increments are less than 0.0001 for 60 consecutive time steps. checks if increments decrease by more than 800 in a single time step. +.. _outlier: + Outlier test -------------------- The :class:`~pecos.monitoring.PerformanceMonitoring.check_outlier` method is used to check if normalized data @@ -199,12 +204,24 @@ Input includes: * Data column (default = None, which indicates that all columns are used) -* Size of the moving window used to normalize the data (default = 3600 seconds) +* Size of the moving window used to normalize the data (default = None). Note that when the window is set to None, the mean and standard deviation of the entire data set is used to normalize the data. 
* Flag indicating if the absolute value of the normalize data is used in the test (default = True) * Minimum number of consecutive failures for reporting (default = 1) +* Flag indicating if the outlier test should use streaming analysis (default=False). + +Note that using a streaming analysis is different than merely defining a moving window. +Streaming analysis omits anomalous values from subsequent normalization calculations, whereas a static analysis with a moving window does not. + +In a static analysis, the mean and standard deviation used to normalize the data are computed +using a moving window (or using the entire data set if window=None) and upper and lower +bounds are used to determine if data points are anomalous. The results do not impact the +moving window statistics. In a streaming analysis, the mean and standard deviation are +computed using a moving window after each data point is determined to be normal or anomalous. +Data points that are determined to be anomalous are not used in the normalization. + For example, .. doctest:: @@ -212,3 +229,153 @@ For example, >>> pm.check_outlier([None, 3], window=12*3600) checks if the normalized data changes by more than 3 standard deviations within a 12 hour moving window. + +.. _custom: + +Custom tests +-------------- +The :class:`~pecos.monitoring.PerformanceMonitoring.check_custom_static` and :class:`~pecos.monitoring.PerformanceMonitoring.check_custom_streaming` methods +allow the user to supply a custom function that is used to determine if data is normal or anomalous. +See :ref:`static_streaming` for more details. + +This feature allows the user to customize the analysis and return custom metadata from the analysis. +The custom function is defined outside of Pecos and handed to the custom quality control method as an input argument. This allows the user to include analysis options that are not currently supported in Pecos or are very specific to their application. 
+While there are no specifications on what this metadata stores, the metadata commonly includes the raw values that were included in a quality control test. For example, while the outlier test returns a boolean value that indicates if data is normal or anomalous, the metadata can include the normalized data value that was used to make that determination. + +The user can also create custom quality control tests by creating a class that inherits from the PerformanceMonitoring class. + +Custom static analysis +^^^^^^^^^^^^^^^^^^^^^^^^ + +Static analysis operates on the entire data set to determine if all data points are normal or anomalous. Input for custom static analysis includes: + +* Custom quality control function with the following general form:: + + def custom_static_function(data): + """ + Parameters + ---------- + data : pandas DataFrame + Data to be analyzed. + + Returns + -------- + mask : pandas DataFrame + Mask contains boolean values and is the same size as data. + True = data passed the quality control test, + False = data failed the quality control test. + + metadata : pandas DataFrame + Metadata stores additional information about the test and is returned by + ''check_custom_static''. Metadata is generally the same size as data. + """ + + # User defined custom algorithm + ... + + return mask, metadata + +* Data column (default = None, which indicates that all columns are used) +* Minimum number of consecutive failures for reporting (default = 1) +* Error message (default = None) + +Custom static analysis can be run using the following example. +The custom function below, ``sine_function``, determines if sin(data) is greater than 0.5 and returns the value of sin(data) as metadata. + +.. doctest:: + + >>> import numpy as np + + >>> def sine_function(data): + ... # Create metadata and mask using sin(data) + ... metadata = np.sin(data) + ... mask = metadata > 0.5 + ... 
return mask, metadata + + >>> metadata = pm.check_custom_static(sine_function) + + +Custom streaming analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The streaming analysis loops through each data point using a quality control tests that relies on information from "clean data" in a moving window. Input for custom streaming analysis includes: + +* Custom quality control function with the following general form:: + + def custom_streaming_function(data_pt, history): + """ + Parameters + ---------- + data_pt : pandas Series + The current data point to be analyzed. + + history : pandas DataFrame + Historical data used in the analysis. The streaming analysis omits + data points that were previously flagged as anomalous in the history. + + Returns + -------- + mask : pandas Series + Mask contains boolean values (one value for each row in data_pt). + True = data passed the quality control test, + False = data failed the quality control test. + + metadata : pandas Series + Metadata stores additional information about the test for the current data point. + Metadata generally contains one value for row in data_pt. Metadata is + collected into a pandas DataFrame with one row per time index that was included + in the quality control test (omits the history window) and is returned + by ''check_custom_streaming''. + """ + + # User defined custom algorithm + ... + + return mask, metadata + +* Size of the moving window used to define the cleaned history. +* Indicator used to rebase the history window. If the user defined fraction of the history window has been deemed anomalous, then the history is reset using raw data. The ability to rebase the history is useful if data changes to a new normal condition that would otherwise continue to be flagged as anomalous. 
(default = None, which indicates that rebase is not used) +* Data column (default = None, which indicates that all columns are used) +* Error message (default = None) + +Custom streaming analysis can be run using the following example. +The custom function below, ``nearest_neighbor``, determines if the current data point is within 3 standard +deviations of data in a 10 minute history window. +In this case, metadata returns the distance from each column in the current data point to its nearest neighbor in the history. +This is similar to the multivariate nearest neighbor algorithm used in CANARY [HMKC07]_. + +.. doctest:: + :hide: + + >>> import pandas as pd + >>> import pecos + >>> pm = pecos.monitoring.PerformanceMonitoring() + >>> index = pd.date_range('1/1/2016', periods=100, freq='60s') + >>> data = np.random.normal(size=(100,3)) + >>> df = pd.DataFrame(data=data, index=index, columns=['A', 'B', 'C']) + >>> pm.add_dataframe(df) + +.. doctest:: + + >>> import numpy as np + >>> import pandas as pd + >>> from scipy.spatial.distance import cdist + + >>> def nearest_neighbor(data_pt, history): + ... # Normalize the current data point and history using the history window + ... zt = (data_pt - history.mean())/history.std() + ... z = (history - history.mean())/history.std() + ... # Compute the distance from the current data point to data in the history window + ... zt_reshape = zt.to_frame().T + ... dist = cdist(zt_reshape, z) + ... # Extract the minimum distance + ... min_dist = np.nanmin(dist) + ... # Extract the index for the min distance and the distance components + ... idx = np.nanargmin(dist) + ... metadata = z.loc[idx,:] - zt + ... # Determine if the min distance is less than 3, assign value (T/F) to the mask + ... mask = pd.Series(min_dist <= 3, index=data_pt.index) + ... 
return mask, metadata + + >>> metadata = pm.check_custom_streaming(nearest_neighbor, window=600) + diff --git a/documentation/reference.rst b/documentation/reference.rst index 93e1d5a..6cad714 100644 --- a/documentation/reference.rst +++ b/documentation/reference.rst @@ -12,6 +12,8 @@ References If the 6 digits match another citation, add a lower case letter (a, b, ...) +.. [HMKC07] Hart, D., McKenna, S.A., Klise, K., Cruz, V., & Wilson, M. (2007) Water quality event detection systems for drinking water contamination warning systems: Development testing and application of CANARY, World Environmental and Water Resources Congress (EWRI), Tampa, FL, May 15-19. + .. [Hunt07] Hunter, J.D. (2007). Matplotlib: A 2D graphics environment. Computing in Science & Engineering, 3(9), 90-95. .. [KlSt16a] Klise, K.A., Stein, J.S. (2016). Performance Monitoring using Pecos, Technical Report SAND2016-3583, Sandia National Laboratories. diff --git a/documentation/results.rst b/documentation/results.rst index f598e0e..5e41ecc 100644 --- a/documentation/results.rst +++ b/documentation/results.rst @@ -3,8 +3,10 @@ Results ========== -Pecos can be used to collect quality control test results and performance -metrics, and generate HTML reports and dashboards. +Analysis run using Pecos results in a collection of +quality control test results, +quality control mask, cleaned data, and performance +metrics. This information can be used to generate HTML/LaTeX reports and dashboards. Quality control test results ------------------------------ @@ -145,17 +147,18 @@ The metrics_file.csv file will contain:: Monitoring reports ------------------------------- -The :class:`~pecos.io.write_monitoring_report` method is used to generate a HTML monitoring report. -The monitoring report includes the start and end time for analysis, custom graphics +The :class:`~pecos.io.write_monitoring_report` method is used to generate a HTML or LaTeX formatted monitoring report. 
+The monitoring report includes the start and end time for the data that was analyzed, custom graphics and performance metrics, a table that includes test results, graphics associated with the test results (highlighting data points that failed a quality control tests), notes on runtime errors and warnings, and the configuration options used in the analysis. * **Custom Graphics:** - Custom graphics can be created for specific applications. These graphics - are included at the top of the report. - Custom graphics can be generated using methods in the :class:`~pecos.graphics` module. + Custom graphics are created by the user for their specific application. + Custom graphics can also be generated using methods in the :class:`~pecos.graphics` module. + These graphics are included at the top of the report. + * **Performance Metrics:** Performance metrics are displayed in a table. @@ -166,7 +169,7 @@ used in the analysis. Test results graphics are generated using the :class:`~pecos.graphics.plot_test_results` method. * **Notes:** - Notes include Pecos runtime errors and warnings. Notes include: + Notes include Pecos runtime errors and warnings. Notes include: * Empty/missing data * Formatting error in the translation dictionary @@ -206,7 +209,7 @@ For each row and column in the dashboard, the following information can be speci * Links (i.e. the path to a monitoring report or other file/site for additional information) -Text, graphics, tables, and links can be combined to create custom dashboards. +The user defined text, graphics, tables, and links create custom dashboards. Pecos includes dashboard examples in the `examples/dashboard `_ directory. :numref:`fig-dashboard1`, :numref:`fig-dashboard2`, and :numref:`fig-dashboard3` show example dashboards generated using Pecos. @@ -255,10 +258,10 @@ These graphics can be included in :ref:`monitoring_reports`. Example test results graphic. Day-of-year vs. 
time-of-day heatmaps, generated using :class:`~pecos.graphics.plot_doy_heatmap`, -can help identify missing data, trends, define filters and define quality control test thresholds when working with large data sets. +help identify missing data, trends, define filters and define quality control test thresholds when working with large data sets. The following figure shows irradiance over a year with the time of sunrise and sunset for each day. The white vertical line indicates one day of missing data. -The method :class:`~pecos.graphics.plot_heatmap` can be used to create simple heatmaps. +The method :class:`~pecos.graphics.plot_heatmap` creates a simple heatmaps. These plots can be included as custom graphics in :ref:`monitoring_reports` and :ref:`dashboards`. .. _fig-doy-heatmap: diff --git a/documentation/scheduler.rst b/documentation/scheduler.rst deleted file mode 100644 index faf3de9..0000000 --- a/documentation/scheduler.rst +++ /dev/null @@ -1,16 +0,0 @@ -Task scheduler -=============== - -To run Pecos on an automated schedule, create a task using your operating systems. -On Windows, open the Control Panel and search for *Schedule Tasks*. -On Linux and OSX, use the *cron* utility. - -Tasks are defined by a trigger and an action. -The trigger indicates when the task should be run (i.e. Daily at 1:00 pm). -The action can be set to run a batch file. -A batch file (.bat or .cmd filename extension) can be easily -written to start a Python script which runs Pecos. -For example, the following batch file runs driver.py:: - - cd your_working_directory - C:\Python27\python.exe driver.py diff --git a/documentation/timefilter.rst b/documentation/timefilter.rst index b49e81c..23df26c 100644 --- a/documentation/timefilter.rst +++ b/documentation/timefilter.rst @@ -23,15 +23,15 @@ The following example defines a time filter between 3 AM and 9 PM, .. 
doctest:: - >>> clocktime = pecos.utils.datetime_to_clocktime(pm.df.index) + >>> clocktime = pecos.utils.datetime_to_clocktime(pm.data.index) >>> time_filter = pd.Series((clocktime > 3*3600) & (clocktime < 21*3600), - ... index=pm.df.index) + ... index=pm.data.index) The time filter can also be defined based on properties of the DataFrame, for example, .. doctest:: - >>> time_filter = pm.df['A'] > 0.5 + >>> time_filter = pm.data['A'] > 0.5 For some applications, it is useful to define the time filter based on sun position, as demonstrated in **pv_example.py** in the diff --git a/documentation/timeseries.rst b/documentation/timeseries.rst index c23cdbb..23f3f16 100644 --- a/documentation/timeseries.rst +++ b/documentation/timeseries.rst @@ -4,7 +4,7 @@ Time series data Pecos uses Pandas DataFrames to store and analyze data indexed by time. Pandas DataFrames store 2D data with labeled columns. Pandas includes a wide range of time series analysis and date-time functionality. By using Pandas DataFrames, -Pecos is able to take advantage of a wide range of timestamp strings, including +Pecos is able to take advantage of a wide range of timestamp string formats, including UTC offset. Pandas includes many built-in functions to read data from CSV, Excel, SQL, etc. @@ -27,7 +27,7 @@ To get started, create an instance of the :class:`~pecos.monitoring.PerformanceM >>> import pecos >>> pm = pecos.monitoring.PerformanceMonitoring() -Data, in the form of a Pandas Dataframe, can then be added to the PerformanceMonitoring object. +Data, in the form of a Pandas DataFrame, can then be added to the PerformanceMonitoring object. .. doctest:: :hide: @@ -44,7 +44,7 @@ The data is accessed using .. doctest:: - >>> pm.df #doctest:+SKIP + >>> pm.data #doctest:+SKIP Multiple DataFrames can be added to the PerformanceMonitoring object. New data overrides existing data if DataFrames share indexes and columns. @@ -80,7 +80,7 @@ Missing indexes and columns are filled with NaN. 
An example is shown below. >>> pm.add_dataframe(data1) >>> pm.add_dataframe(data2) - >>> print(pm.df) + >>> print(pm.data) A B C 2018-01-01 0.0 5.0 NaN 2018-01-02 1.0 0.0 5.0 diff --git a/documentation/translation.rst b/documentation/translation.rst index 1f09742..2c41773 100644 --- a/documentation/translation.rst +++ b/documentation/translation.rst @@ -1,10 +1,10 @@ Translation dictionary ----------------------- A translation dictionary is an optional feature which allows the user to map original -column names into common names that can be more useful for analysis. +column names into common names that can be more useful for analysis and reporting. A translation dictionary can also be used to group columns with similar properties into a single variable. -Using grouped variables, Pecos can run a signal set of quality control tests on the group. +Using grouped variables, Pecos can run a single set of quality control tests on the group. Each entry in a translation dictionary is a key:value pair where 'key' is the common name of the data and 'value' is a list of original column names in the DataFrame. @@ -51,7 +51,7 @@ Inside Pecos, the translation dictionary is used to index into the DataFrame, fo .. doctest:: - >>> pm.df[pm.trans['Wave']] #doctest:+SKIP + >>> pm.data[pm.trans['Wave']] #doctest:+SKIP returns columns 'C' and 'D' from the DataFrame. diff --git a/documentation/whatsnew.rst b/documentation/whatsnew.rst index 9fdd617..8fea40d 100644 --- a/documentation/whatsnew.rst +++ b/documentation/whatsnew.rst @@ -1,7 +1,7 @@ Release Notes ================ -.. include:: whatsnew/v0.1.8.1.rst +.. include:: whatsnew/v0.1.9.rst .. include:: whatsnew/v0.1.8.rst diff --git a/documentation/whatsnew/v0.1.8.1.rst b/documentation/whatsnew/v0.1.8.1.rst deleted file mode 100644 index 94d8f7e..0000000 --- a/documentation/whatsnew/v0.1.8.1.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. 
_whatsnew_0181: - -v0.1.8.1 (master) -------------------------- - -* Bug fix in the way masks are generated. Data points that have Null values were always assigned to False, indicating - that a quality control test failed. Null values are now assumed to be True, unless a specific test fails (e.g. check_missing). - - diff --git a/documentation/whatsnew/v0.1.9.rst b/documentation/whatsnew/v0.1.9.rst new file mode 100644 index 0000000..8bde8ef --- /dev/null +++ b/documentation/whatsnew/v0.1.9.rst @@ -0,0 +1,24 @@ +.. _whatsnew_019: + +v0.1.9 (master) +-------------------------- + +* Added the ability to use custom quality control test functions in static or streaming analysis. The methods, ``check_custom_static`` and ``check_custom_streaming``, allow the user to supply a custom function that is used to determine if data is anomalous. The custom tests also allow the user to return metadata that contains information about the quality control test. + + * The streaming analysis loops through the data using a moving window to determine if a data point is normal or anomalous. If the data point is deemed anomalous it is omitted from the history and not used to determine the status of subsequent data points. + * The static analysis operates on the entire data set, and while it can include operations like moving windows, it does not update the history based on the test results. + +* The following input arguments were changed or added: + + * In ``check_outlier``, the input argument `window` was changed to None (not used), `absolute value` was changed to False, and an input argument `streaming` was added to use streaming analysis (default value is False). Changed the order of `key` and `window` to be more consistent with other quality control tests. + * In ``check_delta``, the input argument `window` is no longer optional + +* Added property ``data`` to the PerformanceMonitoring class. 
pm.data is equivalent to pm.df (pm.df was retained for backward compatibility) +* Added the ability to create monitoring reports using a LaTeX template. Small changes in the HTML template. +* Added the option to specify a date format string to timeseries plots. +* Fixed a bug in the way masks are generated. Data points that have Null values were being assigned to False, indicating + that a quality control test failed. Null values are now assumed to be True, unless a specific test fails (e.g. check_missing). +* Updated the boolean mask used in the code to have a consistent definition (True = data point pass all tests, False = data point did not pass at least one test.) +* Added an example in the docs to illustrate analysis of continuous data +* Added Python 3.8 tests +* Updated documentation and tests diff --git a/examples/dashboard/dashboard_example_1.py b/examples/dashboard/dashboard_example_1.py index b71712c..c0b1063 100644 --- a/examples/dashboard/dashboard_example_1.py +++ b/examples/dashboard/dashboard_example_1.py @@ -50,10 +50,10 @@ pm.check_timestamp(specs['Frequency']) # Generate a time filter - clock_time = pecos.utils.datetime_to_clocktime(pm.df.index) + clock_time = pecos.utils.datetime_to_clocktime(pm.data.index) time_filter = pd.Series((clock_time > specs['Time Filter Min']*3600) & \ (clock_time < specs['Time Filter Max']*3600), - index=pm.df.index) + index=pm.data.index) pm.add_time_filter(time_filter) # Check missing @@ -77,14 +77,14 @@ report_file = os.path.join(results_subdirectory, 'monitoring_report.html') # Generate graphics - test_results_graphics = pecos.graphics.plot_test_results(pm.df, + test_results_graphics = pecos.graphics.plot_test_results(pm.data, pm.test_results, filename_root=graphics_file_rootname) df.plot() plt.savefig(custom_graphics_file, format='png', dpi=500) # Write test results and report files pecos.io.write_test_results(pm.test_results, test_results_file) - pecos.io.write_monitoring_report(pm.df, pm.test_results, 
test_results_graphics, + pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, [custom_graphics_file], QCI, filename=report_file) # Store content to be displayed in the dashboard diff --git a/examples/dashboard/dashboard_example_2.py b/examples/dashboard/dashboard_example_2.py index 975f122..0d58b9b 100644 --- a/examples/dashboard/dashboard_example_2.py +++ b/examples/dashboard/dashboard_example_2.py @@ -54,7 +54,7 @@ report_file = os.path.join(results_subdirectory, 'monitoring_report.html') # Generate graphics - test_results_graphics = pecos.graphics.plot_test_results(pm.df, pm.test_results, + test_results_graphics = pecos.graphics.plot_test_results(pm.data, pm.test_results, pm.tfilter, filename_root=graphics_file_rootname) pecos.graphics.plot_heatmap(QCI, vmin=0, vmax=1) plt.savefig(colorblock_graphics_file, dpi=90, bbox_inches='tight', pad_inches = 0) @@ -64,7 +64,7 @@ # Write test results and report files pecos.io.write_test_results(pm.test_results, test_results_file) - pecos.io.write_monitoring_report(pm.df, pm.test_results, test_results_graphics, + pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, [custom_graphics_file], QCI, filename=report_file) # Store content to be displayed in the dashboard diff --git a/examples/metrics/metrics_example.py b/examples/metrics/metrics_example.py index d8ea69c..64aecc4 100644 --- a/examples/metrics/metrics_example.py +++ b/examples/metrics/metrics_example.py @@ -38,11 +38,11 @@ pm.check_increment([-0.5, None], absolute_value=False) # Generate graphics -test_results_graphics = pecos.graphics.plot_test_results(pm.df, pm.test_results) +test_results_graphics = pecos.graphics.plot_test_results(pm.data, pm.test_results) df.plot(ylim=[-0.2,1.2], figsize=(10.0,4.0)) plt.savefig('custom.png', format='png', dpi=500) # Write test results and report files pecos.io.write_test_results(pm.test_results) -pecos.io.write_monitoring_report(pm.df, pm.test_results, 
test_results_graphics, +pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, ['custom.png'], title='System1 2015') diff --git a/examples/pv/pv_example.py b/examples/pv/pv_example.py index fac9fb0..173d7a4 100644 --- a/examples/pv/pv_example.py +++ b/examples/pv/pv_example.py @@ -58,7 +58,7 @@ pm.check_timestamp(60) # Generate a time filter based on sun position -solarposition = pvlib.solarposition.ephemeris(pm.df.index, location['Latitude'], +solarposition = pvlib.solarposition.ephemeris(pm.data.index, location['Latitude'], location['Longitude']) time_filter = solarposition['apparent_elevation'] > 10 pm.add_time_filter(time_filter) @@ -72,7 +72,7 @@ # Add composite signals for composite_signal in composite_signals: for key,value in composite_signal.items(): - signal = pecos.utils.evaluate_string(value, data=pm.df, + signal = pecos.utils.evaluate_string(value, data=pm.data, trans=pm.trans, col_name=key) pm.add_dataframe(signal) pm.add_translation_dictionary({key: list(signal.columns)}) @@ -100,13 +100,13 @@ metrics = pd.Series(metrics) # Generate graphics -test_results_graphics = pecos.graphics.plot_test_results(pm.df, pm.test_results, pm.tfilter) -pm.df[pm.trans['DC Power']].plot(figsize=(7,3.5)) +test_results_graphics = pecos.graphics.plot_test_results(pm.data, pm.test_results, pm.tfilter) +pm.data[pm.trans['DC Power']].plot(figsize=(7,3.5)) plt.savefig('custom1.png', format='png', dpi=500) -pm.df[['Diffuse_Wm2_Avg', 'Direct_Wm2_Avg', 'Global_Wm2_Avg']].plot(figsize=(7,3.5)) +pm.data[['Diffuse_Wm2_Avg', 'Direct_Wm2_Avg', 'Global_Wm2_Avg']].plot(figsize=(7,3.5)) plt.savefig('custom2.png', format='png', dpi=500) # Write test results and report files pecos.io.write_test_results(pm.test_results) -pecos.io.write_monitoring_report(pm.df, pm.test_results, test_results_graphics, +pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, ['custom1.png', 'custom2.png'], metrics) diff --git a/examples/pv/pv_model.py 
b/examples/pv/pv_model.py index 9cc03bf..74a4af8 100644 --- a/examples/pv/pv_model.py +++ b/examples/pv/pv_model.py @@ -25,7 +25,7 @@ def sapm(pm, sapm_parameters, location): location['Longitude']) # Compute cell temperature - celltemp = pvlib.pvsystem.sapm_celltemp(poa, wind, temperature) + celltemp = pvlib.temperature.sapm_cell(poa, wind, temperature, -3.47, -0.0594, 3) # Compute absolute airmass airmass_relative = pvlib.atmosphere.get_relative_airmass(solarposition['zenith']) @@ -40,7 +40,7 @@ def sapm(pm, sapm_parameters, location): aoi, sapm_parameters) # Run SAPM - sapm_model = pvlib.pvsystem.sapm(Ee, celltemp['temp_cell'], sapm_parameters) + sapm_model = pvlib.pvsystem.sapm(Ee, celltemp, sapm_parameters) # Compute the relative error between observed and predicted DC Power. # Add the composite signal and run a range test @@ -75,7 +75,7 @@ def sapm(pm, sapm_parameters, location): # Compute clearness index dni_insolation = pecos.pv.insolation(dni, tfilter=pm.tfilter) dni_insolation = dni_insolation.values[0] - ea = pvlib.irradiance.extraradiation(index.dayofyear) + ea = pvlib.irradiance.get_extra_radiation(index.dayofyear) ea = pd.Series(index=index, data=ea) ea_insolation = pecos.pv.insolation(ea, tfilter=pm.tfilter) ea_insolation = ea_insolation.values[0] diff --git a/examples/simple/simple_example.py b/examples/simple/simple_example.py index 53a05a4..13183ae 100644 --- a/examples/simple/simple_example.py +++ b/examples/simple/simple_example.py @@ -31,9 +31,9 @@ pm.check_timestamp(900) # Generate a time filter to exclude data points early and late in the day -clock_time = pecos.utils.datetime_to_clocktime(pm.df.index) +clock_time = pecos.utils.datetime_to_clocktime(pm.data.index) time_filter = pd.Series((clock_time > 3*3600) & (clock_time < 21*3600), - index=pm.df.index) + index=pm.data.index) pm.add_time_filter(time_filter) # Check for missing data @@ -44,7 +44,7 @@ # Add a composite signal which compares measurements to a model wave_model = 
np.array(np.sin(10*clock_time/86400)) -wave_measurments = pm.df[pm.trans['Wave']] +wave_measurments = pm.data[pm.trans['Wave']] wave_error = np.abs(wave_measurments.subtract(wave_model,axis=0)) wave_error.columns=['Wave Error C', 'Wave Error D'] pm.add_dataframe(wave_error) @@ -56,9 +56,9 @@ pm.check_range([None, 0.25], 'Wave Error') # Check for stagnant data within a 1 hour moving window -pm.check_delta([0.0001, None], 'A', 3600) -pm.check_delta([0.0001, None], 'B', 3600) -pm.check_delta([0.0001, None], 'Wave', 3600) +pm.check_delta([0.0001, None], 3600, 'A') +pm.check_delta([0.0001, None], 3600, 'B') +pm.check_delta([0.0001, None], 3600, 'Wave') # Check for abrupt changes between consecutive time steps pm.check_increment([None, 0.6], 'Wave') @@ -68,12 +68,12 @@ QCI = pecos.metrics.qci(mask, pm.tfilter) # Generate graphics -test_results_graphics = pecos.graphics.plot_test_results(pm.df, pm.test_results) +test_results_graphics = pecos.graphics.plot_test_results(pm.data, pm.test_results, pm.tfilter) df.plot(ylim=[-1.5,1.5], figsize=(7.0,3.5)) plt.savefig('custom.png', format='png', dpi=500) # Write test results and report files pecos.io.write_test_results(pm.test_results) -pecos.io.write_monitoring_report(pm.df, pm.test_results, test_results_graphics, +pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, ['custom.png'], QCI) \ No newline at end of file diff --git a/examples/simple/simple_example_using_config.py b/examples/simple/simple_example_using_config.py index a2b6a6d..2cc50e6 100644 --- a/examples/simple/simple_example_using_config.py +++ b/examples/simple/simple_example_using_config.py @@ -29,7 +29,7 @@ pm.check_timestamp(config['Specifications']['Frequency']) # Generate a time filter to exclude data points early and late in the day -time_filter = pecos.utils.evaluate_string(config['Time Filter'], df) +time_filter = pecos.utils.evaluate_string(config['Time Filter'], pm.data) pm.add_time_filter(time_filter) # Check for missing data 
@@ -42,7 +42,7 @@ specs = config['Specifications'] for composite_signal in config['Composite Signals']: for key, value in composite_signal.items(): - signal = pecos.utils.evaluate_string(value, pm.df, pm.trans, specs, key) + signal = pecos.utils.evaluate_string(value, pm.data, pm.trans, specs, key) pm.add_dataframe(signal) pm.add_translation_dictionary({key: list(signal.columns)}) @@ -52,7 +52,7 @@ # Check for stagnant data within a 1 hour moving window for key,value in config['Delta'].items(): - pm.check_delta(value, key, 3600) + pm.check_delta(value, 3600, key) # Check for abrupt changes between consecutive time steps for key,value in config['Increment'].items(): @@ -63,11 +63,11 @@ QCI = pecos.metrics.qci(mask, pm.tfilter) # Generate graphics -test_results_graphics = pecos.graphics.plot_test_results(pm.df, pm.test_results) +test_results_graphics = pecos.graphics.plot_test_results(pm.data, pm.test_results, pm.tfilter) df.plot(ylim=[-1.5,1.5], figsize=(7.0,3.5)) plt.savefig('custom.png', format='png', dpi=500) # Write test results and report files pecos.io.write_test_results(pm.test_results) -pecos.io.write_monitoring_report(pm.df, pm.test_results, test_results_graphics, +pecos.io.write_monitoring_report(pm.data, pm.test_results, test_results_graphics, ['custom.png'], QCI) diff --git a/pecos/__init__.py b/pecos/__init__.py index 920b305..cc7687e 100644 --- a/pecos/__init__.py +++ b/pecos/__init__.py @@ -6,9 +6,9 @@ from pecos import utils from pecos import pv -__version__ = '0.1.8.1' +__version__ = '0.1.9' -__copyright__ = """Copyright 2016-2020 National Technology & Engineering +__copyright__ = """Copyright 2016 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. 
Government retains certain rights in this software.""" diff --git a/pecos/graphics.py b/pecos/graphics.py index 7a2ed9b..39ff745 100644 --- a/pecos/graphics.py +++ b/pecos/graphics.py @@ -6,6 +6,7 @@ import numpy as np try: import matplotlib.pyplot as plt + from matplotlib.dates import DateFormatter except: pass try: @@ -21,7 +22,9 @@ except ImportError: def _nottest(afunction): return afunction - + +NoneType = type(None) + logger = logging.getLogger(__name__) def plot_scatter(x,y,xaxis_min=None, xaxis_max=None, yaxis_min=None, @@ -40,22 +43,22 @@ def plot_scatter(x,y,xaxis_min=None, xaxis_max=None, yaxis_min=None, y : pandas DataFrame Y data - xaxis_min : float (optional) + xaxis_min : float, optional X-axis minimum, default = None (autoscale) - xaxis_max : float (optional) + xaxis_max : float, optional X-axis maximum, default = None (autoscale) - yaxis_min : float (optional) + yaxis_min : float, optional Y-axis minimum, default = None (autoscale) - yaxis_max : float (optional) + yaxis_max : float, optional Y-axis maximum, default = None (autoscale) - title : string (optional) + title : string, optional Title, default = None - figsize : tuple (optional) + figsize : tuple, optional Figure size, default = (7.0, 3.0) """ @@ -100,41 +103,47 @@ def plot_scatter(x,y,xaxis_min=None, xaxis_max=None, yaxis_min=None, def plot_timeseries(data, tfilter=None, test_results_group=None, xaxis_min=None, xaxis_max=None, yaxis_min=None, yaxis_max=None, title=None, - figsize=(7.0, 3.0)): + figsize=(7.0, 3.0), date_formatter=None): """ Create a time series plot using each column in the DataFrame. Parameters ---------- - data : pandas DataFrame + data : pandas DataFrame or Series Data, indexed by time - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Boolean values used to include time filter in the plot, default = None - test_results_group : pandas DataFrame (optional) - Test results for a particular variable. 
+ test_results_group : pandas DataFrame, optional + Test results for the data default = None - xaxis_min : float (optional) + xaxis_min : float, optional X-axis minimum, default = None (autoscale) - xaxis_max : float (optional) + xaxis_max : float, optional X-axis maximum, default = None (autoscale) - yaxis_min : float (optional) + yaxis_min : float, optional Y-axis minimum, default = None (autoscale) - yaxis_max : float (optional) + yaxis_max : float, optional Y-axis maximum, default = None (autoscale) - title : string (optional) + title : string, optional Title, default = None - figsize : tuple (optional) + figsize : tuple, optional Figure size, default = (7.0, 3.0) + + date_formatter : string, optional + Date formatter used on the x axis, for example, "%m-%d". Default = None """ + assert isinstance(data, (pd.Series, pd.DataFrame)) + assert isinstance(tfilter, (NoneType, pd.Series)) + plt.figure(figsize = figsize) ax = plt.gca() @@ -147,7 +156,7 @@ def plot_timeseries(data, tfilter=None, test_results_group=None, xaxis_min=None, data.plot(ax=ax, linewidth=1, grid=False, legend=False, fontsize=8, rot=90, label='Data') - if tfilter is not None: + if isinstance(tfilter, pd.Series): # add tfilter temp = np.where(tfilter - tfilter.shift()) temp = np.append(temp[0],len(tfilter)-1) @@ -169,7 +178,7 @@ def plot_timeseries(data, tfilter=None, test_results_group=None, xaxis_min=None, except: pass if test_results_group is not None: - key2 = test_results_group['Error Flag'] + key2 = test_results_group['Error Flag'].fillna('') grouped2 = test_results_group.groupby(key2) for error_flag in key2.unique(): @@ -243,6 +252,10 @@ def plot_timeseries(data, tfilter=None, test_results_group=None, xaxis_min=None, plt.xlabel('Time', fontsize=8) box = ax.get_position() ax.set_position([box.x0, box.y0+0.15, box.width, box.height*0.75]) + + if date_formatter is not None: + date_form = DateFormatter(date_formatter) + ax.xaxis.set_major_formatter(date_form) def 
plot_interactive_timeseries(data, xaxis_min=None, xaxis_max=None, yaxis_min=None, yaxis_max=None, title=None, filename=None, auto_open=True): @@ -255,25 +268,25 @@ def plot_interactive_timeseries(data, xaxis_min=None, xaxis_max=None, yaxis_min= data : pandas DataFrame Data, indexed by time - xaxis_min : float (optional) + xaxis_min : float, optional X-axis minimum, default = None (autoscale) - xaxis_max : float (optional) + xaxis_max : float, optional X-axis maximum, default = None (autoscale) - yaxis_min : float (optional) + yaxis_min : float, optional Y-axis minimum, default = None (autoscale) - yaxis_max : float (optional) + yaxis_max : float, optional Y-axis maximum, default = None (autoscale) - title : string (optional) + title : string, optional Title, default = None filename : string, optional HTML file name, default = None (file will be named temp-plot.html) - auto_open : boolean (optional) + auto_open : boolean, optional Flag indicating if HTML graphic is opened, default = True """ @@ -306,27 +319,27 @@ def plot_heatmap(data, colors=[(0.75, 0.15, 0.15), (1, 0.75, 0.15), (0.15, 0.75, data : pandas DataFrame, pandas Series, or numpy array Data - colors : list (optional) + colors : list, optional List of colors, colors can be specified in any way understandable by matplotlib.colors.ColorConverter.to_rgb(). Default is red to yellow to green. - num_colors : int (optional) + num_colors : int, optional Number of colors in the colormap, default = 12 - cmap : string (optional) + cmap : string, optional Colormap, default = None. Overrides colors and num_colors listed above. 
- vmin : float (optional) + vmin : float, optional Colomap minimum, default = None (autoscale) - vmax : float (optional) + vmax : float, optional Colomap maximum, default = None (autoscale) - title : string (optional) + title : string, optional Title, default = None - figsize : tuple (optional) + figsize : tuple, optional Figure size, default = (5.0, 5.0) """ if isinstance(data, (pd.DataFrame, pd.Series)): @@ -358,24 +371,24 @@ def plot_doy_heatmap(data, cmap='nipy_spectral', vmin=None, vmax=None, data : pandas DataFrame or pandas Series Data (single column), indexed by time - cmap : string (optional) + cmap : string, optional Colomap, default = nipy_spectral - vmin : float (optional) + vmin : float, optional Colomap minimum, default = None (autoscale) - vmax : float (optional) + vmax : float, optional Colomap maximum, default = None (autoscale) - overlay : pandas DataFrame (optional) + overlay : pandas DataFrame, optional Data to overlay on the heatmap. Time index should be in day-of-year (X-axis) Values should be in time-of-day in minutes (Y-axis) - title : string (optional) + title : string, optional Title, default = None - figsize : tuple (optional) + figsize : tuple, optional Figure size, default = (7.0, 3.0) """ @@ -412,7 +425,8 @@ def plot_doy_heatmap(data, cmap='nipy_spectral', vmin=None, vmax=None, @_nottest def plot_test_results(data, test_results, tfilter=None, image_format='png', - dpi=500, figsize=(7.0,3.0), filename_root='test_results'): + dpi=500, figsize=(7.0,3.0), date_formatter=None, + filename_root='test_results'): """ Create test results graphics which highlight data points that failed a quality control test. 
@@ -420,24 +434,27 @@ def plot_test_results(data, test_results, tfilter=None, image_format='png', Parameters ---------- data : pandas DataFrame - Data, indexed by time (pm.df) + Data, indexed by time (pm.data) test_results : pandas DataFrame Summary of the quality control test results (pm.test_results) - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Boolean values used to include time filter in the plot, default = None - image_format : string (optional) + image_format : string , optional Image format, default = 'png' - dpi : int (optional) + dpi : int, optional DPI resolution, default = 500 - figsize : tuple (optional) + figsize : tuple, optional Figure size, default = (7.0,3.0) - filename_root : string (optional) + date_formatter : string, optional + Date formatter used on the x axis, for example, "%m-%d". Default = None + + filename_root : string, optional File name root. If the full path is not provided, files are saved into the current working directory. Each graphic filename is appended with an integer. For example, filename_root = 'test' will generate a files named 'test0.png', @@ -475,8 +492,10 @@ def plot_test_results(data, test_results, tfilter=None, image_format='png', for col_name, test_results_group in grouped: logger.info("Creating graphic for " + col_name) + plot_timeseries(data[col_name], tfilter, - test_results_group=test_results_group, figsize=figsize) + test_results_group=test_results_group, figsize=figsize, + date_formatter=date_formatter) ax = plt.gca() box = ax.get_position() diff --git a/pecos/io.py b/pecos/io.py index 695f475..beb18d4 100644 --- a/pecos/io.py +++ b/pecos/io.py @@ -41,10 +41,10 @@ def read_campbell_scientific(filename, index_col='TIMESTAMP', encoding=None): filename : string File name - index_col : string (optional) + index_col : string, optional Index column name, default = 'TIMESTAMP' - encoding : string (optional) + encoding : string, optional Character encoding (i.e. 
utf-16) Returns @@ -93,16 +93,16 @@ def send_email(subject, body, recipient, sender, attachment=None, sender : string Sender email address - attachment : string (optional) + attachment : string, optional Name of file to attach - host : string (optional) + host : string, optional Name of email host (or host:port), default = 'localhost' - username : string (optional) + username : string, optional Email username for authentication - password : string (optional) + password : string, optional Email password for authentication """ @@ -153,7 +153,7 @@ def write_metrics(metrics, filename='metrics.csv'): metrics : pandas DataFrame Data to add to the metrics file - filename : string (optional) + filename : string, optional File name. If the full path is not provided, the file is saved into the current working directory. By default, the file is named 'metrics.csv' @@ -192,7 +192,7 @@ def write_test_results(test_results, filename='test_results.csv'): test_results : pandas DataFrame Summary of the quality control test results (pm.test_results) - filename : string (optional) + filename : string, optional File name. If the full path is not provided, the file is saved into the current working directory. By default, the file is named 'test_results.csv' @@ -220,7 +220,8 @@ def write_test_results(test_results, filename='test_results.csv'): def write_monitoring_report(data, test_results, test_results_graphics=[], custom_graphics=[], metrics=None, title='Pecos Monitoring Report', config={}, logo=False, - im_width_test_results=700, im_width_custom=700, encode=False, + im_width_test_results=1, im_width_custom=1, im_width_logo=0.1, + encode=False, file_format='html', filename='monitoring_report.html'): """ Generate a monitoring report. 
@@ -230,40 +231,43 @@ def write_monitoring_report(data, test_results, test_results_graphics=[], Parameters ---------- data : pandas DataFrame - Data, indexed by time (pm.df) + Data, indexed by time (pm.data) test_results : pandas DataFrame Summary of the quality control test results (pm.test_results) - test_results_graphics : list of strings (optional) + test_results_graphics : list of strings, optional Graphics files, with full path. These graphics highlight data points that failed a quality control test, created using pecos.graphics.plot_test_results() - custom_graphics : list of strings (optional) + custom_graphics : list of strings, optional Custom files, with full path. Created by the user. - metrics : pandas Series or DataFrame (optional) + metrics : pandas Series or DataFrame, optional Performance metrics to add as a table to the monitoring report - title : string (optional) + title : string, optional Monitoring report title, default = 'Pecos Monitoring Report' - config : dictionary (optional) + config : dictionary, optional Configuration options, to be printed at the end of the report - logo : string (optional) + logo : string, optional Graphic to be added to the report header - im_width_test_results=700 : float (optional) - Image width for test results graphics in the HTML report, default = 700 + im_width_test_results : float, optional + Image width as a fraction of page size, for test results graphics, default = 1 - im_width_custom=700 : float (optional) - Image width for custom graphics in the HTML report, default = 700 + im_width_custom : float, optional + Image width as a fraction of page size, for custom graphics, default = 1 - encode : boolean (optional) + im_width_logo: float, optional + Image width as a fraction of page size, for the logo, default = 0.1 + + encode : boolean, optional Encode graphics in the html, default = False - filename : string (optional) + filename : string, optional File name. 
If the full path is not provided, the file is saved into the current working directory. By default, the file is named 'monitoring_report.html' @@ -283,7 +287,7 @@ def write_monitoring_report(data, test_results, test_results_graphics=[], end_time = data.index[-1] # Set pandas display option - pd.set_option('display.max_colwidth', -1) + pd.set_option('display.max_colwidth', None) pd.set_option('display.width', 40) # Collect notes (from the logger file) @@ -303,38 +307,63 @@ def write_monitoring_report(data, test_results, test_results_graphics=[], # Convert to html format if metrics is None: metrics = pd.DataFrame() - if isinstance(metrics, pd.Series): - metrics_html = metrics.to_frame().to_html(header=False) - if isinstance(metrics, pd.DataFrame): - metrics_html = metrics.to_html(justify='left') - - test_results_html = test_results.to_html(justify='left') - notes_html = notes_df.to_html(justify='left', header=False) + + pecos_logo = join(dirname(pecos.__file__), '..', 'documentation', 'figures', 'logo.png') content = {'start_time': str(start_time), - 'end_time': str(end_time), - 'num_notes': str(notes_df.shape[0]), - 'notes': notes_html, - 'num_test_results': str(test_results.shape[0]), - 'test_results': test_results_html, - 'test_results_graphics': test_results_graphics, - 'custom_graphics': custom_graphics, - 'num_metrics': str(metrics.shape[0]), - 'metrics': metrics_html, - 'config': config} + 'end_time': str(end_time), + 'num_notes': str(notes_df.shape[0]), + 'num_data_columns': str(data.shape[1]), + 'num_test_results': str(test_results.shape[0]), + 'num_metrics': str(metrics.shape[0]), + 'config': config} title = os.path.basename(title) - html_string = _html_template_monitoring_report(content, title, logo, im_width_test_results, im_width_custom, encode) + if file_format == 'html': + content['test_results_graphics'] = test_results_graphics + content['custom_graphics'] = custom_graphics + content['pecos_logo'] = pecos_logo + + if isinstance(metrics, pd.Series): + 
metrics_html = metrics.to_frame().to_html(header=False) + if isinstance(metrics, pd.DataFrame): + metrics_html = metrics.to_html(justify='left') + + content['metrics'] = metrics_html + content['test_results'] = test_results.to_html(justify='left') + content['notes'] = notes_df.to_html(justify='left', header=False) + + im_width_test_results = im_width_test_results*800 + im_width_custom = im_width_custom*800 + im_width_logo = im_width_logo*800 + + file_string = _html_template_monitoring_report(content, title, logo, + im_width_test_results, im_width_custom, im_width_logo, encode) + else: + test_results_graphics = [g.replace('\\', '/') for g in test_results_graphics] + custom_graphics = [g.replace('\\', '/') for g in custom_graphics] + pecos_logo = pecos_logo.replace('\\', '/') + + content['test_results_graphics'] = test_results_graphics + content['custom_graphics'] = custom_graphics + content['pecos_logo'] = pecos_logo + + content['metrics'] = metrics.to_latex(longtable=True) + content['test_results'] = test_results.to_latex(longtable=True) + content['notes'] = notes_df.to_latex(longtable=True) + + file_string = _latex_template_monitoring_report(content, title, logo, + im_width_test_results, im_width_custom, im_width_logo) - # Write html file + # Write file if os.path.dirname(filename) == '': full_filename = os.path.join(os.getcwd(), filename) else: full_filename = filename - html_file = open(full_filename,"w") - html_file.write(html_string) - html_file.close() + fid = open(full_filename,"w") + fid.write(file_string) + fid.close() logger.info("") @@ -377,25 +406,25 @@ def write_dashboard(column_names, row_names, content, title='Pecos Dashboard', 'table': df.to_html(), 'link': {'Link to monitoring report': 'C:\\\\pecos\\\\results\\\\monitoring_report.html'}} - title : string (optional) + title : string, optional Dashboard title, default = 'Pecos Dashboard' - footnote : string (optional) + footnote : string, optional Text to be added to the end of the report - logo : 
string (optional) + logo : string, optional Graphic to be added to the report header - im_width : float (optional) + im_width : float, optional Image width in the HTML report, default = 250 - datatables : boolean (optional) + datatables : boolean, optional Use datatables.net to format the dashboard, default = False. See https://datatables.net/ for more information. - encode : boolean (optional) + encode : boolean, optional Encode graphics in the html, default = False - filename : string (optional) + filename : string, optional File name. If the full path is not provided, the file is saved into the current working directory. By default, the file is named 'dashboard.html' @@ -426,7 +455,18 @@ def write_dashboard(column_names, row_names, content, title='Pecos Dashboard', return full_filename -def _html_template_monitoring_report(content, title, logo, im_width_test_results, im_width_custom, encode): +def _latex_template_monitoring_report(content, title, logo, im_width_test_results, im_width_custom, im_width_logo): + + template = env.get_template('monitoring_report.tex') + + date = datetime.datetime.now() + datestr = date.strftime('%m/%d/%Y') + + version = pecos.__version__ + + return template.render(**locals()) + +def _html_template_monitoring_report(content, title, logo, im_width_test_results, im_width_custom, im_width_logo, encode): # if encode == True, encode the images img_dic = {} @@ -437,6 +477,9 @@ def _html_template_monitoring_report(content, title, logo, im_width_test_results for im in content['test_results_graphics']: img_encode = base64.b64encode(open(im, "rb").read()).decode("utf-8") img_dic[im] = img_encode + im = content['pecos_logo'] + img_encode = base64.b64encode(open(im, "rb").read()).decode("utf-8") + img_dic[im] = img_encode template = env.get_template('monitoring_report.html') diff --git a/pecos/metrics.py b/pecos/metrics.py index f042fa5..3b5d558 100644 --- a/pecos/metrics.py +++ b/pecos/metrics.py @@ -25,7 +25,7 @@ def qci(mask, tfilter=None): 
mask : pandas DataFrame Test results mask, returned from pm.mask - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Time filter containing boolean values for each time index Returns @@ -60,7 +60,7 @@ def rmse(data1, data2, tfilter=None): data2 : pandas DataFrame Data. Note, the column names in data1 must equal the column names in data2 - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Time filter containing boolean values for each time index Returns @@ -103,7 +103,7 @@ def time_integral(data, tfilter=None): data : pandas DataFrame Data - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Time filter containing boolean values for each time index Returns @@ -147,7 +147,7 @@ def time_derivative(data, tfilter=None): data : pandas DataFrame Data - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Filter containing boolean values for each time index Returns @@ -189,7 +189,7 @@ def probability_of_detection(observed, actual, tfilter=None): Actual conditions, (True = background, False = anomalous). Note, the column names in observed must equal the column names in actual - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Filter containing boolean values for each time index Returns @@ -247,7 +247,7 @@ def false_alarm_rate(observed, actual, tfilter=None): Actual conditions, (True = background, False = anomalous). Note, the column names in observed must equal the column names in actual. 
- tfilter : pandas Series (optional) + tfilter : pandas Series, optional Filter containing boolean values for each time index Returns diff --git a/pecos/monitoring.py b/pecos/monitoring.py index a02bf36..90e4056 100644 --- a/pecos/monitoring.py +++ b/pecos/monitoring.py @@ -5,6 +5,7 @@ """ import pandas as pd import numpy as np +import datetime import logging none_list = ['','none','None','NONE', None, [], {}] @@ -12,7 +13,7 @@ logger = logging.getLogger(__name__) -def _documented_by(original): +def _documented_by(original, include_metadata=False): def wrapper(target): docstring = original.__doc__ old = """ @@ -26,7 +27,16 @@ def wrapper(target): Data used in the quality control test, indexed by datetime """ - new_docstring = docstring.replace(old, new) + \ + if include_metadata: + new_docstring = docstring.replace(old, new) + \ + """ + Returns + ---------- + dictionary + Results include cleaned data, mask, test results summary, and metadata + """ + else: + new_docstring = docstring.replace(old, new) + \ """ Returns ---------- @@ -52,23 +62,27 @@ def __init__(self): 'Start Time', 'End Time', 'Timesteps', 'Error Flag']) + @property + def data(self): + """ + Data used in quality control analysis, added to the PerformanceMonitoring + object using ``add_dataframe``. + """ + return self.df + @property def mask(self): """ - Boolean mask indicating data that failed a quality control test - - Returns - -------- - pandas DataFrame - Boolean values for each data point, - True = data point pass all tests, - False = data point did not pass at least one test (or data is NaN). + Boolean mask indicating if data that failed a quality control test. + True = data point pass all tests, False = data point did not pass at least one test. 
""" if self.df.empty: logger.info("Empty database") return + # True = pass, False = fail mask = pd.DataFrame(True, index=self.df.index, columns=self.df.columns) + for i in self.test_results.index: variable = self.test_results.loc[i, 'Variable Name'] start_date = self.test_results.loc[i, 'Start Time'] @@ -86,15 +100,10 @@ def mask(self): @property def cleaned_data(self): """ - Cleaned data set - - Returns - -------- - pandas DataFrame - Cleaned data set, data that failed a quality control test are - replaced by NaN + Cleaned data set, data that failed a quality control test are replaced by NaN. """ return self.df[self.mask] + def _setup_data(self, key): """ @@ -107,12 +116,12 @@ def _setup_data(self, key): # Isolate subset if key is not None if key is not None: try: - df = self.df[self.trans[key]] + df = self.df[self.trans[key]] # copy is not needed except: logger.warning("Undefined key: " + key) return else: - df = self.df + df = self.df.copy() return df @@ -124,17 +133,17 @@ def _generate_test_results(self, df, bound, min_failures, error_prefix): # Lower Bound if bound[0] not in none_list: - mask = (df < bound[0]) + mask = ~(df < bound[0]) # True = passed test error_msg = error_prefix+' < lower bound, '+str(bound[0]) self._append_test_results(mask, error_msg, min_failures) # Upper Bound if bound[1] not in none_list: - mask = (df > bound[1]) + mask = ~(df > bound[1]) # True = passed test error_msg = error_prefix+' > upper bound, '+str(bound[1]) self._append_test_results(mask, error_msg, min_failures) - def _append_test_results(self, mask, error_msg, min_failures=1, use_mask_only=False): + def _append_test_results(self, mask, error_msg, min_failures=1, timestamp_test=False): """ Append QC results to the PerformanceMonitoring object. 
@@ -146,72 +155,65 @@ def _append_test_results(self, mask, error_msg, min_failures=1, use_mask_only=Fa error_msg : string Error message to store with the QC results - min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 - use_mask_only : boolean (optional) - When True, the mask is used directly to determine test - results and the variable name is not included in the - test_results. When False, the mask is used in combination with - pm.df to extract test results. Default = False + timestamp_test : boolean, optional + When True, the mask comes from a timestamp test, and the variable + name should not be included in the test results """ + if not self.tfilter.empty: - mask[~self.tfilter] = False - if mask.sum(axis=1).sum(axis=0) == 0: + mask[~self.tfilter] = True + + if mask.sum(axis=1).sum(axis=0) == mask.shape[0]*mask.shape[1]: return - - if use_mask_only: - sub_df = mask - else: - sub_df = self.df[mask.columns] - - # Find blocks - order = 'col' - if order == 'col': - mask = mask.T - - np_mask = mask.values + + # The mask is translated and then converted to an np array to improve performance. + # Values are reversed (T/F) to find blocks where quality control tests failed. 
+ np_mask = ~mask.T.values start_nans_mask = np.hstack( - (np.resize(np_mask[:,0],(mask.shape[0],1)), + (np.resize(np_mask[:,0],(mask.shape[1],1)), np.logical_and(np.logical_not(np_mask[:,:-1]), np_mask[:,1:]))) stop_nans_mask = np.hstack( (np.logical_and(np_mask[:,:-1], np.logical_not(np_mask[:,1:])), - np.resize(np_mask[:,-1], (mask.shape[0],1)))) - - start_row_idx,start_col_idx = np.where(start_nans_mask) - stop_row_idx,stop_col_idx = np.where(stop_nans_mask) - - if order == 'col': - temp = start_row_idx; start_row_idx = start_col_idx; start_col_idx = temp - temp = stop_row_idx; stop_row_idx = stop_col_idx; stop_col_idx = temp - #mask = mask.T + np.resize(np_mask[:,-1], (mask.shape[1],1)))) + + start_col_idx, start_row_idx = np.where(start_nans_mask) + stop_col_idx, stop_row_idx = np.where(stop_nans_mask) block = {'Start Row': list(start_row_idx), - 'Start Col': list(start_col_idx), - 'Stop Row': list(stop_row_idx), - 'Stop Col': list(stop_col_idx)} - - #if sub_df is None: - # sub_df = self.df + 'Start Col': list(start_col_idx), + 'Stop Row': list(stop_row_idx), + 'Stop Col': list(stop_col_idx)} + # Extract test results from each block + counter=0 + test_results = {} for i in range(len(block['Start Col'])): - length = block['Stop Row'][i] - block['Start Row'][i] + 1 - if length >= min_failures: - if use_mask_only: + + timesteps = block['Stop Row'][i] - block['Start Row'][i] + 1 + if timesteps >= min_failures: + if timestamp_test: var_name = '' else: - var_name = sub_df.iloc[:,block['Start Col'][i]].name #sub_df.icol(block['Start Col'][i]).name - - frame = pd.DataFrame([var_name, - sub_df.index[block['Start Row'][i]], - sub_df.index[block['Stop Row'][i]], - length, error_msg], - index=['Variable Name', 'Start Time', - 'End Time', 'Timesteps', 'Error Flag']) - self.test_results = self.test_results.append(frame.T, ignore_index=True) - + var_name = mask.iloc[:,block['Start Col'][i]].name + + start_time = mask.index[block['Start Row'][i]] + end_time = 
mask.index[block['Stop Row'][i]] + + test_results[counter] = {'Variable Name': var_name, + 'Start Time': start_time, + 'End Time': end_time, + 'Timesteps': timesteps, + 'Error Flag': error_msg} + counter = counter + 1 + + test_results = pd.DataFrame(test_results).T + self.test_results = self.test_results.append(test_results, ignore_index=True) + def add_dataframe(self, data): """ Add data to the PerformanceMonitoring object @@ -224,16 +226,14 @@ def add_dataframe(self, data): assert isinstance(data, pd.DataFrame), 'data must be of type pd.DataFrame' assert isinstance(data.index, pd.core.indexes.datetimes.DatetimeIndex), 'data.index must be a DatetimeIndex' - temp = data.copy() - if self.df is not None: - self.df = temp.combine_first(self.df) + self.df = data.combine_first(self.df) else: - self.df = temp + self.df = data.copy() # Add identity 1:1 translation dictionary trans = {} - for col in temp.columns: + for col in data.columns: trans[col] = [col] self.add_translation_dictionary(trans) @@ -262,15 +262,16 @@ def add_time_filter(self, time_filter): ---------- time_filter : pandas DataFrame with a single column or pandas Series Time filter containing boolean values for each time index + True = keep time index in the quality control results. + False = remove time index from the quality control results. 
""" assert isinstance(time_filter, (pd.Series, pd.DataFrame)), 'time_filter must be of type pd.Series or pd.DataFrame' - if isinstance(time_filter, pd.DataFrame): - self.tfilter = pd.Series(data = time_filter.values[:,0], index = self.df.index) + if isinstance(time_filter, pd.DataFrame) and (time_filter.shape[1] == 1): + self.tfilter = time_filter.squeeze() else: self.tfilter = time_filter - def check_timestamp(self, frequency, expected_start_time=None, expected_end_time=None, min_failures=1, exact_times=True): @@ -283,19 +284,19 @@ def check_timestamp(self, frequency, expected_start_time=None, frequency : int or float Expected time series frequency, in seconds - expected_start_time : Timestamp (optional) + expected_start_time : Timestamp, optional Expected start time. If not specified, the minimum timestamp is used - expected_end_time : Timestamp (optional) + expected_end_time : Timestamp, optional Expected end time. If not specified, the maximum timestamp is used - min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 - exact_times : bool (optional) + exact_times : bool, optional Controls how missing times are checked. 
If True, times are expected to occur at regular intervals (specified in frequency) and the DataFrame is reindexed to match @@ -325,14 +326,14 @@ def check_timestamp(self, frequency, expected_start_time=None, # Check to see if timestamp is monotonic # mask = pd.TimeSeries(self.df.index).diff() < 0 - mask = pd.Series(self.df.index).diff() < pd.Timedelta('0 days 00:00:00') + mask = ~(pd.Series(self.df.index).diff() < pd.Timedelta('0 days 00:00:00')) mask.index = self.df.index - mask[mask.index[0]] = False + mask[mask.index[0]] = True mask = pd.DataFrame(mask) mask.columns = [0] self._append_test_results(mask, 'Nonmonotonic timestamp', - use_mask_only=True, + timestamp_test=True, min_failures=min_failures) # If not monotonic, sort df by timestamp @@ -341,9 +342,9 @@ def check_timestamp(self, frequency, expected_start_time=None, # Check for duplicate timestamps # mask = pd.TimeSeries(self.df.index).diff() == 0 - mask = pd.Series(self.df.index).diff() == pd.Timedelta('0 days 00:00:00') + mask = ~(pd.Series(self.df.index).diff() == pd.Timedelta('0 days 00:00:00')) mask.index = self.df.index - mask[mask.index[0]] = False + mask[mask.index[0]] = True mask = pd.DataFrame(mask) mask.columns = [0] mask['TEMP'] = mask.index # remove duplicates in the mask @@ -357,7 +358,7 @@ def check_timestamp(self, frequency, expected_start_time=None, self.df.drop_duplicates(subset='TEMP', keep='first', inplace=True) self._append_test_results(mask, 'Duplicate timestamp', - use_mask_only=True, + timestamp_test=True, min_failures=min_failures) del self.df['TEMP'] @@ -366,19 +367,19 @@ def check_timestamp(self, frequency, expected_start_time=None, missing = temp.difference(self.df.index).tolist() # reindex DataFrame self.df = self.df.reindex(index=rng) - mask = pd.DataFrame(data=self.df.shape[0]*[False], + mask = pd.DataFrame(data=self.df.shape[0]*[True], index=self.df.index) - mask.loc[missing] = True + mask.loc[missing] = False self._append_test_results(mask, 'Missing timestamp', - 
use_mask_only=True, + timestamp_test=True, min_failures=min_failures) else: # uses pandas >= 0.18 resample syntax df_index = pd.DataFrame(index=self.df.index) df_index[0]=1 # populate with placeholder values - mask = df_index.resample(str(int(frequency*1e3))+'ms').count() == 0 # milliseconds + mask = ~(df_index.resample(str(int(frequency*1e3))+'ms').count() == 0) # milliseconds self._append_test_results(mask, 'Missing timestamp', - use_mask_only=True, + timestamp_test=True, min_failures=min_failures) def check_range(self, bound, key=None, min_failures=1): @@ -391,11 +392,11 @@ def check_range(self, bound, key=None, min_failures=1): [lower bound, upper bound], None can be used in place of a lower or upper bound - key : string (optional) + key : string, optional Data column name or translation dictionary key. If not specified, all columns are used in the test. - min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 """ @@ -424,17 +425,17 @@ def check_increment(self, bound, key=None, increment=1, absolute_value=True, [lower bound, upper bound], None can be used in place of a lower or upper bound - key : string (optional) + key : string, optional Data column name or translation dictionary key. If not specified, all columns are used in the test. 
- increment : int (optional) + increment : int, optional Time step shift used to compute difference, default = 1 - absolute_value : boolean (optional) + absolute_value : boolean, optional Use the absolute value of the increment data, default = True - min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 """ @@ -468,7 +469,7 @@ def check_increment(self, bound, key=None, increment=1, absolute_value=True, self._generate_test_results(df, bound, min_failures, error_prefix) - def check_delta(self, bound, key=None, window=3600, direction=None, + def check_delta(self, bound, window, key=None, direction=None, min_failures=1): """ Check for stagnant data and/or abrupt changes in the data using the @@ -480,15 +481,14 @@ def check_delta(self, bound, key=None, window=3600, direction=None, [lower bound, upper bound], None can be used in place of a lower or upper bound - key : string (optional) + window : int or float + Size of the rolling window (in seconds) used to compute delta + + key : string, optional Data column name or translation dictionary key. If not specified, all columns are used in the test. 
- window : int or float (optional) - Size of the rolling window (in seconds) used to compute delta, - default = 3600 - - direction : str (optional) + direction : str, optional Options = 'positive', 'negative', or None * If direction is positive, then only identify positive deltas @@ -498,13 +498,13 @@ def check_delta(self, bound, key=None, window=3600, direction=None, * If direction is None, then identify both positive and negative deltas - min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 """ assert isinstance(bound, list), 'bound must be of type list' - assert isinstance(key, (NoneType, str)), 'key must be None or of type string' assert isinstance(window, (int, float)), 'window must be of type int or float' + assert isinstance(key, (NoneType, str)), 'key must be None or of type string' assert direction in [None, 'positive', 'negative'], "direction must None or the string 'positive' or 'negative'" assert isinstance(min_failures, int), 'min_failures must be of type int' assert self.df.index.is_monotonic, 'index must be monotonic' @@ -529,17 +529,17 @@ def update_mask(mask1, df, window_str, bound, direction): # the final results include actual data points that caused the failure. # This function uses numpy arrays to improve performance and returns # a mask DataFrame. 
- mask2 = np.zeros((len(mask1.index), len(mask1.columns)), dtype=bool) + mask2 = np.ones((len(mask1.index), len(mask1.columns)), dtype=bool) index = mask1.index # Loop over t, col in mask1 where condition is True - for t,col in list(mask1[mask1 > 0].stack().index): + for t,col in list(mask1[mask1 == 0].stack().index): icol = mask1.columns.get_loc(col) it = mask1.index.get_loc(t) t1 = t-pd.Timedelta(window_str) if (bound == 'lower') and (direction is None): # set the entire time interval to True - mask2[(index >= t1) & (index <= t),icol] = True + mask2[(index >= t1) & (index <= t),icol] = False else: # extract the min and max time @@ -549,20 +549,20 @@ def update_mask(mask1, df, window_str, bound, direction): if bound == 'lower': # bound = upper, direction = positive or negative # set the entire time interval to True if (direction == 'positive') and (min_time <= max_time): - mask2[(index >= t1) & (index <= t),icol] = True + mask2[(index >= t1) & (index <= t),icol] = False elif (direction == 'negative') and (min_time >= max_time): - mask2[(index >= t1) & (index <= t),icol] = True + mask2[(index >= t1) & (index <= t),icol] = False elif bound == 'upper': # bound = upper, direction = None, positive or negative # set the initially flaged location to False - mask2[it,icol] = False + mask2[it,icol] = True # set the time between max/min or min/max to true if min_time < max_time and (direction is None or direction == 'positive'): - mask2[(index >= min_time) & (index <= max_time),icol] = True + mask2[(index >= min_time) & (index <= max_time),icol] = False elif min_time > max_time and (direction is None or direction == 'negative'): - mask2[(index >= max_time) & (index <= min_time),icol] = True + mask2[(index >= max_time) & (index <= min_time),icol] = False elif min_time == max_time: - mask2[it,icol] = True + mask2[it,icol] = False mask2 = pd.DataFrame(mask2, columns=mask1.columns, index=mask1.index) return mask2 @@ -576,24 +576,24 @@ def update_mask(mask1, df, window_str, 
bound, direction): # Lower Bound if bound[0] not in none_list: - mask = (diff_df < bound[0]) + mask = ~(diff_df < bound[0]) error_msg = error_prefix+' < lower bound, '+str(bound[0]) if not self.tfilter.empty: - mask[~self.tfilter] = False + mask[~self.tfilter] = True mask = update_mask(mask, df, window_str, 'lower', direction) self._append_test_results(mask, error_msg, min_failures) # Upper Bound if bound[1] not in none_list: - mask = (diff_df > bound[1]) + mask = ~(diff_df > bound[1]) error_msg = error_prefix+' > upper bound, '+str(bound[1]) if not self.tfilter.empty: - mask[~self.tfilter] = False + mask[~self.tfilter] = True mask = update_mask(mask, df, window_str, 'upper', direction) self._append_test_results(mask, error_msg, min_failures) - def check_outlier(self, bound, key=None, window=3600, absolute_value=True, + def check_outlier(self, bound, window=None, key=None, absolute_value=False, streaming=False, min_failures=1): """ Check for outliers using normalized data within a rolling window @@ -607,55 +607,84 @@ def check_outlier(self, bound, key=None, window=3600, absolute_value=True, [lower bound, upper bound], None can be used in place of a lower or upper bound - key : string (optional) - Data column name or translation dictionary key. If not specified, - all columns are used in the test. - - window : int or float (optional) + window : int or float, optional Size of the rolling window (in seconds) used to normalize data, - default = 3600. If window is set to None, data is normalized using + If window is set to None, data is normalized using the entire data sets mean and standard deviation (column by column). - - absolute_value : boolean (optional) + default = None. + + key : string, optional + Data column name or translation dictionary key. If not specified, + all columns are used in the test. 
+ + absolute_value : boolean, optional Use the absolute value the normalized data, default = True - - min_failures : int (optional) + + streaming : boolean, optional + Indicates if streaming analysis should be used, default = False + + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 """ assert isinstance(bound, list), 'bound must be of type list' - assert isinstance(key, (NoneType, str)), 'key must be None or of type string' assert isinstance(window, (NoneType, int, float)), 'window must be None or of type int or float' + assert isinstance(key, (NoneType, str)), 'key must be None or of type string' assert isinstance(absolute_value, bool), 'absolute_value must be of type bool' + assert isinstance(streaming, bool), 'streaming must be of type bool' assert isinstance(min_failures, int), 'min_failures must be type int' assert self.df.index.is_monotonic, 'index must be monotonic' + def outlier(data_pt, history): + + mean = history.mean() + std = history.std() + zt = (data_pt - mean)/std + zt.replace([np.inf, -np.inf], np.nan, inplace=True) + + # True = pass, False = fail + if absolute_value: + zt = abs(zt) + + mask = pd.Series(True, index=zt.index) + if bound[0] not in none_list: + mask = mask & (zt >= bound[0]) + if bound[1] not in none_list: + mask = mask & (zt <= bound[1]) + + return mask, zt + logger.info("Check for outliers") df = self._setup_data(key) if df is None: return - - # Compute normalized data - if window is not None: - window_str = str(int(window*1e3)) + 'ms' # milliseconds - df_mean = df.rolling(window_str, min_periods=2, closed='both').mean() - df_std = df.rolling(window_str, min_periods=2, closed='both').std() - df = (df - df_mean)/df_std - else: - df = (df - df.mean())/df.std() - if absolute_value: - df = np.abs(df) - df.replace([np.inf, -np.inf], np.nan, inplace=True) - + if absolute_value: error_prefix = '|Outlier|' else: error_prefix = 'Outlier' - - 
#df[df.index[0]:df.index[0]+datetime.timedelta(seconds=window)] = np.nan - - self._generate_test_results(df, bound, min_failures, error_prefix) + + if streaming: + metadata = self.check_custom_streaming(outlier, window, rebase=0.5, error_message=error_prefix) + else: + # Compute normalized data + if window is not None: + window_str = str(int(window*1e3)) + 'ms' # milliseconds + df_mean = df.rolling(window_str, min_periods=2, closed='both').mean() + df_std = df.rolling(window_str, min_periods=2, closed='both').std() + df = (df - df_mean)/df_std + else: + df = (df - df.mean())/df.std() + + df.replace([np.inf, -np.inf], np.nan, inplace=True) + + if absolute_value: + df = np.abs(df) + + #df[df.index[0]:df.index[0]+datetime.timedelta(seconds=window)] = np.nan + + self._generate_test_results(df, bound, min_failures, error_prefix) def check_missing(self, key=None, min_failures=1): """ @@ -663,11 +692,11 @@ def check_missing(self, key=None, min_failures=1): Parameters ---------- - key : string (optional) + key : string, optional Data column name or translation dictionary key. If not specified, all columns are used in the test. 
- min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 """ @@ -681,12 +710,13 @@ def check_missing(self, key=None, min_failures=1): return # Extract missing data - mask = pd.isnull(df) # checks for np.nan, np.inf + mask = ~pd.isnull(df) # checks for np.nan, np.inf, True = passed test + # Check to see if the missing data was already flagged as a missing timestamp missing_timestamps = self.test_results[ self.test_results['Error Flag'] == 'Missing timestamp'] for index, row in missing_timestamps.iterrows(): - mask.loc[row['Start Time']:row['End Time']] = False + mask.loc[row['Start Time']:row['End Time']] = True self._append_test_results(mask, 'Missing data', min_failures=min_failures) @@ -699,11 +729,11 @@ def check_corrupt(self, corrupt_values, key=None, min_failures=1): corrupt_values : list of int or floats List of corrupt data values - key : string (optional) + key : string, optional Data column name or translation dictionary key. If not specified, all columns are used in the test. 
- min_failures : int (optional) + min_failures : int, optional Minimum number of consecutive failures required for reporting, default = 1 """ @@ -718,14 +748,148 @@ def check_corrupt(self, corrupt_values, key=None, min_failures=1): return # Extract corrupt data - mask = pd.DataFrame(data = np.zeros(df.shape), index = df.index, columns = df.columns, dtype = bool) # all False - for i in corrupt_values: - mask = mask | (df == i) - self.df[mask] = np.nan + mask = ~df.isin(corrupt_values) # True = passed test + + # Replace corrupt data with NaN + self.df[~mask] = np.nan self._append_test_results(mask, 'Corrupt data', min_failures=min_failures) + def check_custom_static(self, quality_control_func, key=None, min_failures=1, + error_message=None): + """ + Use custom functions that operate on the entire dataset at once to + perform quality control analysis + + Parameters + ---------- + quality_control_func : function + Function that operates on self.df and returns a mask and metadata + + key : string, optional + Data column name or translation dictionary key. If not specified, + all columns are used in the test. 
+ + min_failures : int, optional + Minimum number of consecutive failures required for reporting, + default = 1 + + error_message : str, optional + Error message + """ + assert callable(quality_control_func), 'quality_control_func must be a callable function' + assert isinstance(key, (NoneType, str)), 'key must be None or of type string' + assert isinstance(min_failures, int), 'min_failures must be type int' + assert isinstance(error_message, (NoneType, str)), 'error_message must be None or of type string' + + df = self._setup_data(key) + if df is None: + return + + # Function that operates on the entire dataset and returns a mask and + # metadata for the entire dataset + mask, metadata = quality_control_func(self.df) + assert isinstance(mask, pd.DataFrame), 'mask returned by quality_control_func must be of type pd.DataFrame' + assert isinstance(metadata, pd.DataFrame), 'metadata returned by quality_control_func must be of type pd.DataFrame' + + # Function that modifies the mask + #if post_process_func is not None: + # mask = post_process_func(mask) + + self._append_test_results(mask, error_message, min_failures) + + return metadata + + def check_custom_streaming(self, quality_control_func, window, key=None, + rebase=None, min_failures=1, error_message=None): + """ + Check for anomalous data using a streaming framework which removes + anomalous data from the history after each timestamp. A custom quality + control function is supplied by the user to determine if the data is anomalous. + + Parameters + ---------- + quality_control_func : function + Function that determines if the last data point is normal or anomalous. + Returns a mask and metadata for the last data point. + + window : int or float + Size of the rolling window (in seconds) used to define history + If window is set to None, data is normalized using + the entire data sets mean and standard deviation (column by column). + + key : string, optional + Data column name or translation dictionary key. 
If not specified, + all columns are used in the test. + + rebase : int, float, or None + Value between 0 and 1 that indicates the fraction of missing data in the history window above which the current data point is reset to its original value, + default = None. + min_failures : int, optional + Minimum number of consecutive failures required for reporting, + default = 1 + + error_message : str, optional + Error message + """ + assert callable(quality_control_func), 'quality_control_func must be a callable function' + assert isinstance(window, (int, float)), 'window must be of type int or float' + assert isinstance(key, (NoneType, str)), 'key must be None or of type string' + assert isinstance(rebase, (NoneType, int, float)), 'rebase must be None or type int or float' + assert isinstance(min_failures, int), 'min_failures must be type int' + assert isinstance(error_message, (NoneType, str)), 'error_message must be None or of type string' + + df = self._setup_data(key) + if df is None: + return + + metadata = {} + rebase_count = 0 + history_window = datetime.timedelta(seconds=window) + + # The mask must be the same size as data + # The streaming framework uses numpy arrays to improve performance but + # still expects pandas DataFrames and Series in the user defined quality + # control function to keep data types consistent on the user side. 
+ np_mask = pd.DataFrame(True, index=self.df.index, columns=self.df.columns).values + np_data = df.values.astype('Float64') + + ti = df.index.get_loc(df.index[0]+history_window) + + for i, t in enumerate(np.arange(ti,np_data.shape[0],1)): + + t_start = df.index.get_loc(df.index[t]-history_window, method='nearest') + t_timestamp = df.index[t] + + data_pt = pd.Series(np_data[t], index=df.columns) + history = pd.DataFrame(np_data[t_start:t], index=range(t-t_start), columns=df.columns) + + mask_t, metadata[t_timestamp] = quality_control_func(data_pt, history) + if i == 0: + assert isinstance(mask_t, pd.Series), 'mask returned by quality_control_func must be of type pd.Series' + assert isinstance(metadata[t_timestamp], pd.Series), 'metadata returned by quality_control_func must be of type pd.Series' + + np_mask[t] = mask_t.values + np_data[~np_mask] = np.NAN + + # rebase + if rebase is not None: + data_history = np_data[t_start:t+1] # +1 so it includes history and current data point + check_rebase = np.isnan(data_history).sum(axis=0)/data_history.shape[0] > rebase + if sum(check_rebase) > 0: + np_data[t][check_rebase] = df.iloc[t][check_rebase] + rebase_count = rebase_count + sum(check_rebase) + + mask = pd.DataFrame(np_mask, index=self.df.index, columns=self.df.columns) + self._append_test_results(mask, error_message, min_failures) + + # Convert metadata to a dataframe + metadata = pd.DataFrame(metadata).T + + return metadata + + ### Functional approach @_documented_by(PerformanceMonitoring.check_timestamp) def check_timestamp(data, frequency, expected_start_time=None, @@ -737,7 +901,7 @@ def check_timestamp(data, frequency, expected_start_time=None, min_failures, exact_times) mask = pm.mask - return {'cleaned_data': pm.df, 'mask': mask, 'test_results': pm.test_results} + return {'cleaned_data': pm.data, 'mask': mask, 'test_results': pm.test_results} @_documented_by(PerformanceMonitoring.check_range) @@ -764,23 +928,23 @@ def check_increment(data, bound, key=None, 
increment=1, absolute_value=True, @_documented_by(PerformanceMonitoring.check_delta) -def check_delta(data, bound, key=None, window=3600, direction=None, min_failures=1): +def check_delta(data, bound, window, key=None, direction=None, min_failures=1): pm = PerformanceMonitoring() pm.add_dataframe(data) - pm.check_delta(bound, key, window, direction, min_failures) + pm.check_delta(bound, window, key, direction, min_failures) mask = pm.mask return {'cleaned_data': data[mask], 'mask': mask, 'test_results': pm.test_results} @_documented_by(PerformanceMonitoring.check_outlier) -def check_outlier(data, bound, key=None, window=3600, absolute_value=True, - min_failures=1): +def check_outlier(data, bound, window=None, key=None, absolute_value=False, + streaming=False, min_failures=1): pm = PerformanceMonitoring() pm.add_dataframe(data) - pm.check_outlier(bound, key, window, absolute_value, min_failures) + pm.check_outlier(bound, window, key, absolute_value, streaming, min_failures) mask = pm.mask return {'cleaned_data': data[mask], 'mask': mask, 'test_results': pm.test_results} @@ -806,3 +970,27 @@ def check_corrupt(data, corrupt_values, key=None, min_failures=1): mask = pm.mask return {'cleaned_data': data[mask], 'mask': mask, 'test_results': pm.test_results} + +@_documented_by(PerformanceMonitoring.check_custom_static, include_metadata=True) +def check_custom_static(data, quality_control_func, key=None, min_failures=1, + error_message=None): + + pm = PerformanceMonitoring() + pm.add_dataframe(data) + metadata = pm.check_custom_static(quality_control_func, key, min_failures, error_message) + mask = pm.mask + + return {'cleaned_data': data[mask], 'mask': mask, 'test_results': pm.test_results, + 'metadata': metadata} + +@_documented_by(PerformanceMonitoring.check_custom_streaming, include_metadata=True) +def check_custom_streaming(data, quality_control_func, window, key=None, rebase=None, + min_failures=1, error_message=None): + + pm = PerformanceMonitoring() + 
pm.add_dataframe(data) + metadata = pm.check_custom_streaming(quality_control_func, window, key, rebase, min_failures, error_message) + mask = pm.mask + + return {'cleaned_data': data[mask], 'mask': mask, 'test_results': pm.test_results, + 'metadata': metadata} \ No newline at end of file diff --git a/pecos/pv.py b/pecos/pv.py index bd96380..cf3d33b 100644 --- a/pecos/pv.py +++ b/pecos/pv.py @@ -24,7 +24,7 @@ def insolation(G, tfilter=None): G : pandas DataFrame Irradiance time series - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Time filter containing boolean values for each time index Returns @@ -54,7 +54,7 @@ def energy(P, tfilter=None): P : pandas DataFrame Power time series - tfilter : pandas Series (optional) + tfilter : pandas Series, optional Time filter containing boolean values for each time index Returns @@ -90,7 +90,7 @@ def performance_ratio(E, H_poa, P_ref, G_ref=1000): P_ref : float DC power rating at STC conditions - G_ref : float (optional) + G_ref : float, optional Reference irradiance, default = 1000 Returns @@ -129,7 +129,7 @@ def normalized_current(I, G_poa, I_sco, G_ref=1000): I_sco : float Short circuit current at STC conditions - G_ref : float (optional) + G_ref : float, optional Reference irradiance, default = 1000 Returns @@ -168,7 +168,7 @@ def normalized_efficiency(P, G_poa, P_ref, G_ref=1000): P_ref : float DC power rating at STC conditions - G_ref : float (optional) + G_ref : float, optional Reference irradiance, default = 1000 Returns diff --git a/pecos/templates/base.html b/pecos/templates/base.html index f4f68c4..a156d3e 100644 --- a/pecos/templates/base.html +++ b/pecos/templates/base.html @@ -7,8 +7,8 @@ {% block body %} -
This report was generated by Pecos - {{ version }}, {{ datestr }} +
Report generated by Pecos
+ Version {{ version }}, Date {{ datestr }} {% endblock %} \ No newline at end of file diff --git a/pecos/templates/monitoring_report.html b/pecos/templates/monitoring_report.html index 6a69232..c27bf04 100644 --- a/pecos/templates/monitoring_report.html +++ b/pecos/templates/monitoring_report.html @@ -13,7 +13,9 @@ {% if logo %} - Logo +

Logo {{ title }}

+{% else %} +

{{ title }}

{% endif %} @@ -21,12 +23,11 @@
-

{{ title }}

-Start time: {{ content['start_time'] }}
-End time: {{ content['end_time'] }}
-Test Failures: {{ content['num_test_results'] }}
-Notes: {{ content['num_notes'] }}
+Data start time: {{ content['start_time'] }}
+Data end time: {{ content['end_time'] }}
+Number of variables: {{ content['num_data_columns']}}
+Number of test failures: {{ content['num_test_results'] }}

{% for im in content['custom_graphics'] %} @@ -62,8 +63,6 @@

Test Results:

Notes:

{{ content['notes'] }}
-{% else %} -

Notes:

None

{% endif %} {% if content['config'] %} diff --git a/pecos/templates/monitoring_report.tex b/pecos/templates/monitoring_report.tex new file mode 100644 index 0000000..7d765c3 --- /dev/null +++ b/pecos/templates/monitoring_report.tex @@ -0,0 +1,86 @@ +\documentclass[letterpaper, 11pt]{article} + +\usepackage[margin=0.5in]{geometry} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{array} +\usepackage[hidelinks]{hyperref} +\usepackage{longtable} +\usepackage{placeins} + +\setlength\parindent{0pt} +\hypersetup{colorlinks=false} +\maxdeadcycles=200 + +\begin{document} + +{% if logo %} +\begin{minipage}[c]{\linewidth} + {\includegraphics[width={{ im_width_logo }}\linewidth]{% raw -%}{{%- endraw %}{{logo}}{% raw -%}}{%- endraw %} } + \textbf{\Large{ {{title}} } } +\end{minipage}\hfill +{% else %} \begin{minipage}[c]{\linewidth} + \textbf{\Large{ {{title}} } } +\end{minipage}\hfill +{% endif %} + +\underline{\hspace{\linewidth}} +\vspace{0.05in} + +Data start time: {{ content['start_time']}} + +Data end time: {{ content['end_time']}} + +Number of variables: {{ content['num_data_columns']}} + +Number of test failures: {{ content['num_test_results']}} + +{% for im in content['custom_graphics'] %} + \begin{figure}[h!] + \includegraphics[width={{ im_width_custom }}\linewidth]{% raw -%}{{%- endraw %}{{im}}{% raw -%}}{%- endraw %} + \end{figure} +{% endfor %} + +\FloatBarrier + +{% if content['num_metrics']|int > 0 %} + \textbf{\Large{Performance Metrics}} + {{ content['metrics'] }} +{% endif %} + +\textbf{\Large{Test Results}} + +\vspace{0.1in} +\small{\centering{ +{{ content['test_results'] }} +}} + +{% for im in content['test_results_graphics'] %} + \begin{figure}[h!] 
+ \includegraphics[width={{ im_width_test_results }}\linewidth]{% raw -%}{{%- endraw %}{{im}}{% raw -%}}{%- endraw %} + \end{figure} +{% endfor %} + +\FloatBarrier + +{% if content['num_notes']|int > 0 %} + \textbf{\Large{Notes}} + {{ content['notes'] }} +{% endif %} + +{% if content['config'] %} + \textbf{\Large{Configuration Options}} + {{ content['config'] }} +{% endif %} + +\vspace{0.05in} +\underline{\hspace{\linewidth}} + +\footnotesize{ +Report generated by \href{https://pecos.readthedocs.io/}{\includegraphics[height=0.26cm]{% raw -%}{{%- endraw %}{{content['pecos_logo']}}{% raw -%}}{%- endraw %} } + +Version {{ version }}, Date {{ datestr }} } + +\end{document} + + diff --git a/pecos/tests/test_metrics.py b/pecos/tests/test_metrics.py index 06dcdf8..f301f1d 100644 --- a/pecos/tests/test_metrics.py +++ b/pecos/tests/test_metrics.py @@ -1,5 +1,5 @@ from nose.tools import * -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal from os.path import abspath, dirname, join import pecos import numpy as np diff --git a/pecos/tests/test_monitoring.py b/pecos/tests/test_monitoring.py index 7da06cb..599d43e 100644 --- a/pecos/tests/test_monitoring.py +++ b/pecos/tests/test_monitoring.py @@ -4,7 +4,7 @@ import pecos import pandas as pd from pandas import Timestamp, RangeIndex -from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.testing import assert_frame_equal, assert_series_equal import numpy as np from numpy import array @@ -52,9 +52,9 @@ def simple_example_run_analysis(df): pm.check_range([None, 0.25], 'Wave Error') # Check for stagnant data within a 1 hour moving window - pm.check_delta([0.0001, None], 'A', 3600) - pm.check_delta([0.0001, None], 'B', 3600) - pm.check_delta([0.0001, None], 'Wave', 3600) + pm.check_delta([0.0001, None], 3600, 'A') + pm.check_delta([0.0001, None], 3600, 'B') + pm.check_delta([0.0001, None], 3600, 'Wave') # Check for abrupt changes between consecutive time 
steps pm.check_increment([None, 0.6], 'Wave') @@ -221,8 +221,8 @@ def test_check_delta(self): # Object-oriented test self.pm.check_corrupt([-999]) - self.pm.check_delta([0.0001, None], window=2*3600) - self.pm.check_delta([None, 0.6], 'Wave', window=900) + self.pm.check_delta([0.0001, None], 2*3600) + self.pm.check_delta([None, 0.6], 900, 'Wave') test_results = self.pm.test_results[['Delta' in ef for ef in self.pm.test_results['Error Flag']]] #pecos.graphics.plot_test_results(self.pm.df, self.pm.test_results, filename_root='test_check_delta') @@ -543,6 +543,105 @@ def test_outlier(self): index=RangeIndex(start=0, stop=2, step=1) ) assert_frame_equal(expected, self.pm.test_results) + + # Functional tests + results = pecos.monitoring.check_outlier(self.pm.data, [None, 1.9], window=None, absolute_value=True ) + test_results = results['test_results'] + expected = pd.DataFrame( + array([['A', Timestamp('2017-01-01 06:00:00'), Timestamp('2017-01-01 06:00:00'), 1, '|Outlier| > upper bound, 1.9'], + ['A', Timestamp('2017-01-01 19:00:00'), Timestamp('2017-01-01 19:00:00'), 1, '|Outlier| > upper bound, 1.9']], dtype=object), + columns=['Variable Name', 'Start Time', 'End Time', 'Timesteps', 'Error Flag'], + index=RangeIndex(start=0, stop=2, step=1) + ) + assert_frame_equal(test_results, expected, + check_dtype=False) + + + def test_outlier_streaming(self): + # outlier if stdev > 1.9 + pass + +class Test_check_custom(unittest.TestCase): + + @classmethod + def setUp(self): + N = 1000 + np.random.seed(92837) + index = pd.date_range('1/1/2020', periods=N, freq='S') + data = {'A': np.random.normal(size=N),'B': np.random.normal(size=N)} + df = pd.DataFrame(data, index=index) + + self.pm = pecos.monitoring.PerformanceMonitoring() + self.pm.add_dataframe(df) + + @classmethod + def tearDown(self): + pass + + def test_custom_static(self): + + def custom_func(data): + mask = (data.abs() < 2) + metadata = data + return mask, metadata + + metadata = 
self.pm.check_custom_static(custom_func, error_message='Static') + N = self.pm.df.shape[0]*self.pm.df.shape[1] + percent = 1-self.pm.test_results['Timesteps'].sum()/N + assert_almost_equal(percent, 0.95, 2) # 95% within 2 std + + # Functional tests + results = pecos.monitoring.check_custom_static(self.pm.data, custom_func, error_message='Static') + percent = 1-results['test_results']['Timesteps'].sum()/N + assert_almost_equal(percent, 0.95, 2) # 95% within 2 std + + def test_custom_streaming(self): + + def custom_func(data_pt, history): + mask = (data_pt.abs() < 2) + metadata = data_pt + return mask, metadata + + metadata = self.pm.check_custom_streaming(custom_func, 50, error_message='Streaming') + N = self.pm.df.shape[0]*self.pm.df.shape[1] + percent = 1-self.pm.test_results['Timesteps'].sum()/N + assert_almost_equal(percent, 0.95, 2) # 95% within 2 std + + # Functional tests + results = pecos.monitoring.check_custom_streaming(self.pm.data, custom_func, 50, error_message='Streaming') + percent = 1-results['test_results']['Timesteps'].sum()/N + assert_almost_equal(percent, 0.95, 2) # 95% within 2 std + +class Test_append_test_results(unittest.TestCase): + + @classmethod + def setUp(self): + self.pm = pecos.monitoring.PerformanceMonitoring() + + @classmethod + def tearDown(self): + pass + + def test_append_test_results(self): + mask = pd.DataFrame(True, columns=['A', 'B', 'C', 'D', 'E'], index=range(10)) + mask.loc[0:3,'A'] = False # start of time series + mask.loc[5,'A'] = False # single time + mask.loc[7:9,'B'] = False # end of a column + mask.loc[0:5,'C'] = False # wrap False across two columns + mask.loc[8:9,'E'] = False # end of time series + + self.pm._append_test_results(mask, 'None') + + expected = pd.DataFrame( + array([['A', 0, 3, 4, 'None'], + ['A', 5, 5, 1, 'None'], + ['B', 7, 9, 3, 'None'], + ['C', 0, 5, 6, 'None'], + ['E', 8, 9, 2, 'None']], dtype=object), + columns=['Variable Name', 'Start Time', 'End Time', 'Timesteps', 'Error Flag'], + 
index=range(5)) + assert_frame_equal(expected, self.pm.test_results) + if __name__ == '__main__': unittest.main() diff --git a/pecos/tests/test_pv.py b/pecos/tests/test_pv.py index 5fc4f36..a33df4f 100644 --- a/pecos/tests/test_pv.py +++ b/pecos/tests/test_pv.py @@ -1,5 +1,5 @@ from nose.tools import * -from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.testing import assert_frame_equal, assert_series_equal from os.path import abspath, dirname, join import pandas as pd import numpy as np diff --git a/pecos/tests/test_utils.py b/pecos/tests/test_utils.py index ba88072..68ef01a 100644 --- a/pecos/tests/test_utils.py +++ b/pecos/tests/test_utils.py @@ -2,7 +2,7 @@ import sys from nose import SkipTest from nose.tools import * -from pandas.util.testing import assert_frame_equal, assert_index_equal +from pandas.testing import assert_frame_equal, assert_index_equal import pandas as pd import numpy as np import pecos diff --git a/pecos/utils.py b/pecos/utils.py index 5002675..72cdc04 100644 --- a/pecos/utils.py +++ b/pecos/utils.py @@ -18,7 +18,7 @@ def index_to_datetime(index, unit='s', origin='unix'): index : pandas Index DataFrame index in int or float - unit : str (optional) + unit : str, optional Units of the original index origin : str @@ -111,7 +111,7 @@ def round_index(index, frequency, how='nearest'): frequency : int Expected time series frequency, in seconds - how : string (optional) + how : string, optional Method for rounding, default = 'nearest'. 
Options include: * nearest = round the index to the nearest frequency @@ -168,13 +168,17 @@ def evaluate_string(string_to_eval, data=None, trans=None, specs=None, col_name= string_to_eval : string String to evaluate, the string can included multiple keywords and numpy (np.*) and pandas (pd.*) functions - data : pandas DataFrame (optional) + + data : pandas DataFrame, optional Data, indexed by datetime - trans: dictionary (optional) + + trans: dictionary, optional Translation dictionary - specs : dictionary (optional) + + specs : dictionary, optional Keyword:value pairs used to define constants - col_name : string (optional) + + col_name : string, optional Column name used in the returned DataFrame. If the DataFrame has more than one column, columns are named col_name 0, col_name 1, ...