Update the time shift Ruptures function with the new logic that was t…

…ested/validated at PVRW 2023 (#197) * update with new tested logic * variable renaming * more variable renaming * fixed binseg call * amended .loc error * update the function after pipeline testing * fixed the subtraction to put everything in minutes * update unit tests * removed rounding function from unit tests * added to whatsnew + added to function docstring * added the pull number to whatsnew * fixed index issue * update to remove the rounding function as we don't use it anymore * updated from pelt to binseg reference
pvlib · Dec 22, 2023 · e7aeb91 · e7aeb91
1 parent 904603d
commit e7aeb91
Showing 3 changed files with 88 additions and 93 deletions.
diff --git a/docs/whatsnew/0.2.0.rst b/docs/whatsnew/0.2.0.rst
@@ -11,6 +11,12 @@ Breaking Changes
   accounting for array incidence loss (IAM), and including losses in the PVWatts
   inverter model. Additionally, added optional arguments for bounding the azimuth range in
   during least squares optimization. (:issue:`147`, :pull:`180`)
+* Updated function :py:func:`~pvanalytics.quality.time.shifts_ruptures` to align with the
+  methodology tested and reported on at PVRW 2023 ("Survey of Time Shift Detection Algorithms
+  for Measured PV Data"). This includes converting the changepoint detection algorithm from
+  Pelt to Binary Segmentation (which runs much faster), and performing additional processing
+  to each detected segment to remove outliers and filter by a quantile cutoff instead of the
+  original rounding technique. (:pull:`197`)
 
 
 Enhancements

diff --git a/pvanalytics/quality/time.py b/pvanalytics/quality/time.py
@@ -1,7 +1,7 @@
 """Quality tests related to time."""
 import warnings
 import pandas as pd
-import numpy as np
+from pvanalytics.quality.outliers import zscore
 
 
 def spacing(times, freq):
@@ -37,38 +37,23 @@ def spacing(times, freq):
     return delta == freq
 
 
-def _round_multiple(x, to, up_from=None):
-    # Return `x` rounded to the nearest multiple of `to`
-    #
-    # Parameters
-    # ----------
-    # x : Series
-    # to : int
-    #     `x` is rounded to a multiple of `to`
-    # up_from : int, optional
-    #     If the remainder of `x` / `to` is greater than `up_from`
-    #     then `x` is rounded up, otherwise `x` is rounded down.
-    #     If not specified rounding will go up from `to` // 2.
-    up_from = up_from or to // 2
-    quotient, remainder = divmod(abs(x), to)
-    remainder[remainder > up_from] = to
-    remainder[remainder <= up_from] = 0
-    return np.sign(x) * (quotient*to + remainder)
-
-
-def shifts_ruptures(event_times, reference_times, period_min=2,
-                    shift_min=15, round_up_from=None,
-                    prediction_penalty=13):
+def shifts_ruptures(event_times, reference_times,
+                    period_min=15,
+                    shift_min=15,
+                    prediction_penalty=20,
+                    zscore_cutoff=2,
+                    bottom_quantile_threshold=0,
+                    top_quantile_threshold=.5):
     """Identify time shifts using the ruptures library.
 
     Compares the event time in the expected time zone (`reference_times`)
     with the actual event time in `event_times`.
 
-    The Pelt changepoint detection method is applied to the difference
-    between `event_times` and `reference_times`. For each period between
-    change points the mode of the difference is rounded to a multiple of
-    `shift_min` and returned as the time-shift for all days in that
-    period.
+    The Binary Segmentation changepoint detection method is applied to the
+    difference between `event_times` and `reference_times`. For each period
+    between change points the mode of the difference is rounded to a
+    multiple of `shift_min` and returned as the time-shift for all days in
+    that period.
 
     Parameters
     ----------
@@ -81,27 +66,32 @@ def shifts_ruptures(event_times, reference_times, period_min=2,
         timezone. For example, passing solar transit time in a fixed offset
         time zone can be used to detect daylight savings shifts when it is
         unknown whether or not `event_times` is in a fixed offset time zone.
-    period_min : int, default 2
+    period_min : int, default 15
         Minimum number of days between shifts. Must be less than or equal to
         the number of days in `event_times`. [days]
-
         Increasing this parameter will make the result less sensitive to
         transient shifts. For example if your intent is to find and correct
         daylight savings time shifts passing `period_min=60` can give good
         results while excluding shorter periods that appear shifted.
     shift_min : int, default 15
         Minimum shift amount in minutes. All shifts are rounded to a multiple
         of `shift_min`. [minutes]
-    round_up_from : int, optional
-        The number of minutes greater than a multiple of `shift_min` for a
-        shift to be rounded up. If a shift is less than `round_up_from` then
-        it will be rounded towards 0. If not specified then the shift will
-        be rounded up from `shift_min // 2`. Using a larger value will
-        effectively make the shift detection more conservative as small
-        variations will tend to be rounded to zero. [minutes]
     prediction_penalty : int, default 13
         Penalty used in assessing change points.
-        See :py:meth:`ruptures.detection.Pelt.predict` for more information.
+        See :py:meth:`ruptures.detection.Binseg.predict` for more information.
+    zscore_cutoff : int, default 2
+        Z-score cutoff / maximum for filtering out outliers in each identified
+        segment found via changepount detection
+    bottom_quantile_threshold : float, default 0
+        Bottom quantile threshold for each time series segment identified via
+        changepoint detection. All data below this threshold is not considered
+        when determining the mean value for the segment, which is later
+        rounded to the nearest `period_min` value
+    top_quantile_threshold : float, default 0.5
+        Top quantile threshold for each time series segment identified via
+        changepoint detection. All data above this threshold is not considered
+        when determining the mean value for the segment, which is later
+        rounded to the nearest `period_min` value
 
     Returns
     -------
@@ -141,6 +131,11 @@ def shifts_ruptures(event_times, reference_times, period_min=2,
 
     Derived from the PVFleets QA project.
 
+    References
+    -------
+    .. [1] Perry K., Meyers B., and Muller, M. "Survey of Time Shift
+        Detection Algorithms for Measured PV Data", 2023 PV Reliability
+        Workshop (PVRW).
     """
     try:
         import ruptures
@@ -150,35 +145,48 @@ def shifts_ruptures(event_times, reference_times, period_min=2,
     if period_min > len(event_times):
         raise ValueError("period_min exceeds number of days in event_times")
     # Drop timezone information. At this point there is one value per day
-    # so the timezone is irrelevant.
+    # so the timezone is irrelevant. Get the time difference in minutes.
     time_diff = \
-        event_times.tz_localize(None) - reference_times.tz_localize(None)
-    break_points = ruptures.Pelt(
-        model='rbf',
-        jump=1,
-        min_size=period_min
-    ).fit_predict(
-        signal=time_diff.values,
-        pen=prediction_penalty
-    )
+        (event_times.tz_localize(None) -
+         reference_times.tz_localize(None))
+    # Get the index before removing NaN's
+    time_diff_orig_index = time_diff.index
+    # Remove NaN's from the time_diff series, because NaN's screw up the
+    # ruptures prediction
+    time_diff = time_diff.dropna()
+    # Run changepoint detection to find breaks
+    break_points = ruptures.Binseg(model='rbf',
+                                   jump=1,
+                                   min_size=period_min).fit_predict(
+                                       signal=time_diff.values,
+                                       pen=prediction_penalty)
     # Make sure the entire series is covered by the intervals between
     # the breakpoints that were identified above. This means adding a
     # breakpoint at the beginning of the series (0) and at the end if
     # one does not already exist.
     break_points.insert(0, 0)
     if break_points[-1] != len(time_diff):
         break_points.append(len(time_diff))
-    time_diff = _round_multiple(time_diff, shift_min, round_up_from)
-    shift_amount = time_diff.groupby(
-        pd.cut(
-            time_diff.reset_index().index,
-            break_points,
-            include_lowest=True, right=False,
-            duplicates='drop'
-        )
-    ).transform(
-        lambda shifted_period: shifted_period.mode().values[0]
-    )
+    # Go through each time shift segment and perform the following steps:
+    # 1) Remove the outliers from each segment using a z-score filter
+    # 2) Remove any cases that are not within the bottom and top quantile
+    #    parameters for the segment. This helps to remove more erroneous
+    #    data.
+    # 3) Take the mean of each segment and round it to the nearest shift_min
+    #    multiplier
+    shift_amount = time_diff.copy()
+    for index in range(len(break_points) - 1):
+        segment = time_diff[break_points[index]:
+                            break_points[index + 1]]
+        segment = segment[~zscore(segment, zmax=zscore_cutoff,
+                                  nan_policy='omit')]
+        segment = segment[
+            (segment >= segment.quantile(bottom_quantile_threshold)) &
+            (segment <= segment.quantile(top_quantile_threshold))]
+        shift_amount.iloc[break_points[index]: break_points[index + 1]] = \
+            shift_min * round(float(segment.mean())/shift_min)
+    # Update the shift_amount series with the original time series index
+    shift_amount = shift_amount.reindex(time_diff_orig_index).ffill()
     # localize the shift_amount series to the timezone of the input
     shift_amount = shift_amount.tz_localize(event_times.index.tz)
     return shift_amount != 0, shift_amount

diff --git a/pvanalytics/tests/quality/test_time.py b/pvanalytics/tests/quality/test_time.py
@@ -196,7 +196,7 @@ def test_has_dst_no_dst_in_date_range(albuquerque):
 def midday(request, albuquerque):
     solar_position = albuquerque.get_solarposition(
         pd.date_range(
-            start='1/1/2020', end='2/29/2020 23:59',
+            start='1/1/2020', end='3/30/2020 23:59',
             tz='MST', freq=request.param
         )
     )
@@ -233,10 +233,10 @@ def test_shift_ruptures_positive_shift(midday):
     shifted = _shift_between(
         midday, 60,
         start='2020-01-01',
-        end='2020-02-29'
+        end='2020-03-30'
     )
     expected_shift_mask = pd.Series(False, index=midday.index)
-    expected_shift_mask['2020-01-01':'2020-02-29'] = True
+    expected_shift_mask['2020-01-01':'2020-03-30'] = True
     shift_mask, shift_amounts = time.shifts_ruptures(shifted, midday)
     assert_series_equal(shift_mask, expected_shift_mask, check_names=False)
     assert_series_equal(
@@ -251,10 +251,10 @@ def test_shift_ruptures_negative_shift(midday):
     shifted = _shift_between(
         midday, -60,
         start='2020-01-01',
-        end='2020-02-29'
+        end='2020-03-30'
     )
     expected_shift_mask = pd.Series(False, index=midday.index)
-    expected_shift_mask['2020-01-01':'2020-02-29'] = True
+    expected_shift_mask['2020-01-01':'2020-03-30'] = True
     shift_mask, shift_amounts = time.shifts_ruptures(shifted, midday)
     assert_series_equal(shift_mask, expected_shift_mask, check_names=False)
     assert_series_equal(
@@ -294,9 +294,11 @@ def _shift_between(series, shift, start, end):
 @requires_ruptures
 def test_shift_ruptures_period_min(midday):
     no_shifts = pd.Series(0, index=midday.index, dtype='int64')
+    # period_min must be equal to length of series / 2 or less in order for
+    # binary segmentation algoritm to work.
     shift_mask, shift_amount = time.shifts_ruptures(
         midday, midday,
-        period_min=len(midday)
+        period_min=len(midday) / 2
     )
     assert not shift_mask.any()
     assert_series_equal(
@@ -344,10 +346,10 @@ def test_shifts_ruptures_shift_at_end(midday):
     shifted = _shift_between(
         midday, 60,
         start='2020-02-01',
-        end='2020-02-29'
+        end='2020-03-30'
     )
     shift_expected = pd.Series(0, index=shifted.index, dtype='int64')
-    shift_expected['2020-02-02':'2020-02-29'] = 60
+    shift_expected['2020-02-02':'2020-03-30'] = 60
     shift_mask, shift_amount = time.shifts_ruptures(shifted, midday)
     assert_series_equal(shift_mask, shift_expected != 0, check_names=False)
     assert_series_equal(
@@ -362,11 +364,12 @@ def test_shifts_ruptures_shift_in_middle(midday):
     shifted = _shift_between(
         midday, 60,
         start='2020-01-25',
-        end='2020-02-15'
+        end='2020-03-05'
     )
     shift_expected = pd.Series(0, index=shifted.index, dtype='int64')
-    shift_expected['2020-01-26':'2020-02-15'] = 60
-    shift_mask, shift_amount = time.shifts_ruptures(shifted, midday)
+    shift_expected['2020-01-26':'2020-03-05'] = 60
+    shift_mask, shift_amount = time.shifts_ruptures(shifted, midday,
+                                                    prediction_penalty=13)
     assert_series_equal(
         shift_mask,
         shift_expected != 0,
@@ -391,7 +394,7 @@ def test_shift_ruptures_shift_min(midday):
     no_shift = pd.Series(0, index=shifted.index, dtype='int64')
     shift_mask, shift_amount = time.shifts_ruptures(
         shifted, midday,
-        shift_min=60, round_up_from=40
+        shift_min=60
     )
     assert not shift_mask.any()
     assert_series_equal(
@@ -482,25 +485,3 @@ def test_dst_dates(timezone, expected_dates):
         time.dst_dates(index.tz_localize(None), timezone),
         expected.tz_localize(None)
     )
-
-
-def test_rounding():
-    xs = pd.Series(
-        [-10, 10, -16, 16, -28, 28, -30, 30, -8, 8, -7, 7, -3, 3, 0]
-    )
-    assert_series_equal(
-        time._round_multiple(xs, 15),
-        pd.Series([-15, 15, -15, 15, -30, 30, -30, 30, -15, 15, 0, 0, 0, 0, 0])
-    )
-    assert_series_equal(
-        time._round_multiple(xs, 15, up_from=9),
-        pd.Series([-15, 15, -15, 15, -30, 30, -30, 30, 0, 0, 0, 0, 0, 0, 0])
-    )
-    assert_series_equal(
-        time._round_multiple(xs, 15, up_from=15),
-        pd.Series([0, 0, -15, 15, -15, 15, -30, 30, 0, 0, 0, 0, 0, 0, 0])
-    )
-    assert_series_equal(
-        time._round_multiple(xs, 30),
-        pd.Series([0, 0, -30, 30, -30, 30, -30, 30, 0, 0, 0, 0, 0, 0, 0])
-    )