Skip to content

Commit

Permalink
Update the time shift Ruptures function with the new logic that was t…
Browse files Browse the repository at this point in the history
…ested/validated at PVRW 2023 (#197)

* update with new tested logic

* variable renaming

* more variable renaming

* fixed binseg call

* amended .loc error

* update the function after pipeline testing

* fixed the subtraction to put everything in minutes

* update unit tests

* removed rounding function from unit tests

* added to whatsnew + added to function docstring

* added the pull number to whatsnew

* fixed index issue

* update to remove the rounding function as we don't use it anymore

* updated from pelt to binseg reference
kperrynrel authored Dec 22, 2023
1 parent 904603d commit e7aeb91
Showing 3 changed files with 88 additions and 93 deletions.
6 changes: 6 additions & 0 deletions docs/whatsnew/0.2.0.rst
Original file line number Diff line number Diff line change
@@ -11,6 +11,12 @@ Breaking Changes
accounting for array incidence loss (IAM), and including losses in the PVWatts
inverter model. Additionally, added optional arguments for bounding the azimuth range in
during least squares optimization. (:issue:`147`, :pull:`180`)
* Updated function :py:func:`~pvanalytics.quality.time.shifts_ruptures` to align with the
methodology tested and reported on at PVRW 2023 ("Survey of Time Shift Detection Algorithms
for Measured PV Data"). This includes converting the changepoint detection algorithm from
Pelt to Binary Segmentation (which runs much faster), and performing additional processing
to each detected segment to remove outliers and filter by a quantile cutoff instead of the
original rounding technique. (:pull:`197`)


Enhancements
126 changes: 67 additions & 59 deletions pvanalytics/quality/time.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Quality tests related to time."""
import warnings
import pandas as pd
import numpy as np
from pvanalytics.quality.outliers import zscore


def spacing(times, freq):
@@ -37,38 +37,23 @@ def spacing(times, freq):
return delta == freq


def _round_multiple(x, to, up_from=None):
# Return `x` rounded to the nearest multiple of `to`
#
# Parameters
# ----------
# x : Series
# to : int
# `x` is rounded to a multiple of `to`
# up_from : int, optional
# If the remainder of `x` / `to` is greater than `up_from`
# then `x` is rounded up, otherwise `x` is rounded down.
# If not specified rounding will go up from `to` // 2.
up_from = up_from or to // 2
quotient, remainder = divmod(abs(x), to)
remainder[remainder > up_from] = to
remainder[remainder <= up_from] = 0
return np.sign(x) * (quotient*to + remainder)


def shifts_ruptures(event_times, reference_times, period_min=2,
shift_min=15, round_up_from=None,
prediction_penalty=13):
def shifts_ruptures(event_times, reference_times,
period_min=15,
shift_min=15,
prediction_penalty=20,
zscore_cutoff=2,
bottom_quantile_threshold=0,
top_quantile_threshold=.5):
"""Identify time shifts using the ruptures library.
Compares the event time in the expected time zone (`reference_times`)
with the actual event time in `event_times`.
The Pelt changepoint detection method is applied to the difference
between `event_times` and `reference_times`. For each period between
change points the mode of the difference is rounded to a multiple of
`shift_min` and returned as the time-shift for all days in that
period.
The Binary Segmentation changepoint detection method is applied to the
difference between `event_times` and `reference_times`. For each period
between change points the mode of the difference is rounded to a
multiple of `shift_min` and returned as the time-shift for all days in
that period.
Parameters
----------
@@ -81,27 +66,32 @@ def shifts_ruptures(event_times, reference_times, period_min=2,
timezone. For example, passing solar transit time in a fixed offset
time zone can be used to detect daylight savings shifts when it is
unknown whether or not `event_times` is in a fixed offset time zone.
period_min : int, default 2
period_min : int, default 15
Minimum number of days between shifts. Must be less than or equal to
the number of days in `event_times`. [days]
Increasing this parameter will make the result less sensitive to
transient shifts. For example if your intent is to find and correct
daylight savings time shifts passing `period_min=60` can give good
results while excluding shorter periods that appear shifted.
shift_min : int, default 15
Minimum shift amount in minutes. All shifts are rounded to a multiple
of `shift_min`. [minutes]
round_up_from : int, optional
The number of minutes greater than a multiple of `shift_min` for a
shift to be rounded up. If a shift is less than `round_up_from` then
it will be rounded towards 0. If not specified then the shift will
be rounded up from `shift_min // 2`. Using a larger value will
effectively make the shift detection more conservative as small
variations will tend to be rounded to zero. [minutes]
prediction_penalty : int, default 13
Penalty used in assessing change points.
See :py:meth:`ruptures.detection.Pelt.predict` for more information.
See :py:meth:`ruptures.detection.Binseg.predict` for more information.
zscore_cutoff : int, default 2
Z-score cutoff / maximum for filtering out outliers in each identified
segment found via changepount detection
bottom_quantile_threshold : float, default 0
Bottom quantile threshold for each time series segment identified via
changepoint detection. All data below this threshold is not considered
when determining the mean value for the segment, which is later
rounded to the nearest `period_min` value
top_quantile_threshold : float, default 0.5
Top quantile threshold for each time series segment identified via
changepoint detection. All data above this threshold is not considered
when determining the mean value for the segment, which is later
rounded to the nearest `period_min` value
Returns
-------
@@ -141,6 +131,11 @@ def shifts_ruptures(event_times, reference_times, period_min=2,
Derived from the PVFleets QA project.
References
-------
.. [1] Perry K., Meyers B., and Muller, M. "Survey of Time Shift
Detection Algorithms for Measured PV Data", 2023 PV Reliability
Workshop (PVRW).
"""
try:
import ruptures
@@ -150,35 +145,48 @@ def shifts_ruptures(event_times, reference_times, period_min=2,
if period_min > len(event_times):
raise ValueError("period_min exceeds number of days in event_times")
# Drop timezone information. At this point there is one value per day
# so the timezone is irrelevant.
# so the timezone is irrelevant. Get the time difference in minutes.
time_diff = \
event_times.tz_localize(None) - reference_times.tz_localize(None)
break_points = ruptures.Pelt(
model='rbf',
jump=1,
min_size=period_min
).fit_predict(
signal=time_diff.values,
pen=prediction_penalty
)
(event_times.tz_localize(None) -
reference_times.tz_localize(None))
# Get the index before removing NaN's
time_diff_orig_index = time_diff.index
# Remove NaN's from the time_diff series, because NaN's screw up the
# ruptures prediction
time_diff = time_diff.dropna()
# Run changepoint detection to find breaks
break_points = ruptures.Binseg(model='rbf',
jump=1,
min_size=period_min).fit_predict(
signal=time_diff.values,
pen=prediction_penalty)
# Make sure the entire series is covered by the intervals between
# the breakpoints that were identified above. This means adding a
# breakpoint at the beginning of the series (0) and at the end if
# one does not already exist.
break_points.insert(0, 0)
if break_points[-1] != len(time_diff):
break_points.append(len(time_diff))
time_diff = _round_multiple(time_diff, shift_min, round_up_from)
shift_amount = time_diff.groupby(
pd.cut(
time_diff.reset_index().index,
break_points,
include_lowest=True, right=False,
duplicates='drop'
)
).transform(
lambda shifted_period: shifted_period.mode().values[0]
)
# Go through each time shift segment and perform the following steps:
# 1) Remove the outliers from each segment using a z-score filter
# 2) Remove any cases that are not within the bottom and top quantile
# parameters for the segment. This helps to remove more erroneous
# data.
# 3) Take the mean of each segment and round it to the nearest shift_min
# multiplier
shift_amount = time_diff.copy()
for index in range(len(break_points) - 1):
segment = time_diff[break_points[index]:
break_points[index + 1]]
segment = segment[~zscore(segment, zmax=zscore_cutoff,
nan_policy='omit')]
segment = segment[
(segment >= segment.quantile(bottom_quantile_threshold)) &
(segment <= segment.quantile(top_quantile_threshold))]
shift_amount.iloc[break_points[index]: break_points[index + 1]] = \
shift_min * round(float(segment.mean())/shift_min)
# Update the shift_amount series with the original time series index
shift_amount = shift_amount.reindex(time_diff_orig_index).ffill()
# localize the shift_amount series to the timezone of the input
shift_amount = shift_amount.tz_localize(event_times.index.tz)
return shift_amount != 0, shift_amount
49 changes: 15 additions & 34 deletions pvanalytics/tests/quality/test_time.py
Original file line number Diff line number Diff line change
@@ -196,7 +196,7 @@ def test_has_dst_no_dst_in_date_range(albuquerque):
def midday(request, albuquerque):
solar_position = albuquerque.get_solarposition(
pd.date_range(
start='1/1/2020', end='2/29/2020 23:59',
start='1/1/2020', end='3/30/2020 23:59',
tz='MST', freq=request.param
)
)
@@ -233,10 +233,10 @@ def test_shift_ruptures_positive_shift(midday):
shifted = _shift_between(
midday, 60,
start='2020-01-01',
end='2020-02-29'
end='2020-03-30'
)
expected_shift_mask = pd.Series(False, index=midday.index)
expected_shift_mask['2020-01-01':'2020-02-29'] = True
expected_shift_mask['2020-01-01':'2020-03-30'] = True
shift_mask, shift_amounts = time.shifts_ruptures(shifted, midday)
assert_series_equal(shift_mask, expected_shift_mask, check_names=False)
assert_series_equal(
@@ -251,10 +251,10 @@ def test_shift_ruptures_negative_shift(midday):
shifted = _shift_between(
midday, -60,
start='2020-01-01',
end='2020-02-29'
end='2020-03-30'
)
expected_shift_mask = pd.Series(False, index=midday.index)
expected_shift_mask['2020-01-01':'2020-02-29'] = True
expected_shift_mask['2020-01-01':'2020-03-30'] = True
shift_mask, shift_amounts = time.shifts_ruptures(shifted, midday)
assert_series_equal(shift_mask, expected_shift_mask, check_names=False)
assert_series_equal(
@@ -294,9 +294,11 @@ def _shift_between(series, shift, start, end):
@requires_ruptures
def test_shift_ruptures_period_min(midday):
no_shifts = pd.Series(0, index=midday.index, dtype='int64')
# period_min must be equal to length of series / 2 or less in order for
# binary segmentation algoritm to work.
shift_mask, shift_amount = time.shifts_ruptures(
midday, midday,
period_min=len(midday)
period_min=len(midday) / 2
)
assert not shift_mask.any()
assert_series_equal(
@@ -344,10 +346,10 @@ def test_shifts_ruptures_shift_at_end(midday):
shifted = _shift_between(
midday, 60,
start='2020-02-01',
end='2020-02-29'
end='2020-03-30'
)
shift_expected = pd.Series(0, index=shifted.index, dtype='int64')
shift_expected['2020-02-02':'2020-02-29'] = 60
shift_expected['2020-02-02':'2020-03-30'] = 60
shift_mask, shift_amount = time.shifts_ruptures(shifted, midday)
assert_series_equal(shift_mask, shift_expected != 0, check_names=False)
assert_series_equal(
@@ -362,11 +364,12 @@ def test_shifts_ruptures_shift_in_middle(midday):
shifted = _shift_between(
midday, 60,
start='2020-01-25',
end='2020-02-15'
end='2020-03-05'
)
shift_expected = pd.Series(0, index=shifted.index, dtype='int64')
shift_expected['2020-01-26':'2020-02-15'] = 60
shift_mask, shift_amount = time.shifts_ruptures(shifted, midday)
shift_expected['2020-01-26':'2020-03-05'] = 60
shift_mask, shift_amount = time.shifts_ruptures(shifted, midday,
prediction_penalty=13)
assert_series_equal(
shift_mask,
shift_expected != 0,
@@ -391,7 +394,7 @@ def test_shift_ruptures_shift_min(midday):
no_shift = pd.Series(0, index=shifted.index, dtype='int64')
shift_mask, shift_amount = time.shifts_ruptures(
shifted, midday,
shift_min=60, round_up_from=40
shift_min=60
)
assert not shift_mask.any()
assert_series_equal(
@@ -482,25 +485,3 @@ def test_dst_dates(timezone, expected_dates):
time.dst_dates(index.tz_localize(None), timezone),
expected.tz_localize(None)
)


def test_rounding():
xs = pd.Series(
[-10, 10, -16, 16, -28, 28, -30, 30, -8, 8, -7, 7, -3, 3, 0]
)
assert_series_equal(
time._round_multiple(xs, 15),
pd.Series([-15, 15, -15, 15, -30, 30, -30, 30, -15, 15, 0, 0, 0, 0, 0])
)
assert_series_equal(
time._round_multiple(xs, 15, up_from=9),
pd.Series([-15, 15, -15, 15, -30, 30, -30, 30, 0, 0, 0, 0, 0, 0, 0])
)
assert_series_equal(
time._round_multiple(xs, 15, up_from=15),
pd.Series([0, 0, -15, 15, -15, 15, -30, 30, 0, 0, 0, 0, 0, 0, 0])
)
assert_series_equal(
time._round_multiple(xs, 30),
pd.Series([0, 0, -30, 30, -30, 30, -30, 30, 0, 0, 0, 0, 0, 0, 0])
)

0 comments on commit e7aeb91

Please sign in to comment.