Fix IndexError in PyCBC Live's followup code (gwastro#4676)
* Draft fix for followup indexing error

* Use a nonzero min required lookback, better variable names

* Alex's comment on td_samples vs delta_f

* Add docstring for `followup_event_significance`

* Docstring fixes

* Work on comments

* Try to fix Sphinx error

* Simplify definition of strain buffer length

* Explicit kwarg name

Co-authored-by: maxtrevor <[email protected]>

* Simplify `from_cli()` args

* Fix error

* Update example

---------

Co-authored-by: maxtrevor <[email protected]>
titodalcanton and maxtrevor committed Apr 10, 2024
1 parent 0d98014 commit 1198d96
Showing 5 changed files with 223 additions and 79 deletions.
32 changes: 23 additions & 9 deletions bin/pycbc_live
@@ -104,6 +104,7 @@ class LiveEventManager(object):
self.padata = livepau.PAstroData(args.p_astro_spec, args.bank_file)
self.use_date_prefix = args.day_hour_output_prefix
self.ifar_upload_threshold = args.ifar_upload_threshold
self.pvalue_lookback_time = args.pvalue_lookback_time
self.pvalue_livetime = args.pvalue_combination_livetime
self.gracedb_server = args.gracedb_server
self.gracedb_search = args.gracedb_search
@@ -221,7 +222,8 @@ class LiveEventManager(object):
self.data_readers[ifo],
self.bank,
template_id,
coinc_times
coinc_times,
lookback=self.pvalue_lookback_time
)
if pvalue_info is None:
continue
@@ -802,6 +804,20 @@ class LiveEventManager(object):
store_psd[ifo].save(fname, group='%s/psd' % ifo)


def check_max_length(args, waveforms):
"""Check that the `--max-length` option is sufficient to accomodate the
longest template in the bank and the PSD estimation options.
"""
lengths = numpy.array([1.0 / wf.delta_f for wf in waveforms])
psd_len = args.psd_segment_length * (args.psd_samples // 2 + 1)
max_length = max(lengths.max() + args.pvalue_lookback_time, psd_len)
if max_length > args.max_length:
raise ValueError(
'--max-length is too short for this template bank. '
f'Use at least {max_length}.'
)


parser = argparse.ArgumentParser(description=__doc__)
pycbc.waveform.bank.add_approximant_arg(parser)
parser.add_argument('--verbose', action='store_true')
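As a rough illustration of the arithmetic in `check_max_length()` (all numbers hypothetical): with --psd-segment-length 16 and --psd-samples 32, PSD estimation needs 16 * (32 // 2 + 1) = 272 s of strain, while a 256 s template (delta_f = 1/256 Hz) plus the default 150 s p-value lookback needs 406 s, so --max-length must be at least 406 s.

    psd_len = 16 * (32 // 2 + 1)                 # PSD estimation: 272 s
    template_len = 1.0 / (1.0 / 256)             # longest template: 256 s
    required = max(template_len + 150, psd_len)  # lookback pushes this to 406 s
    assert required == 406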
@@ -962,6 +978,10 @@ parser.add_argument('--enable-background-estimation', default=False, action='sto
parser.add_argument('--ifar-double-followup-threshold', type=float, required=True,
help='Inverse-FAR threshold to followup double coincs with'
'additional detectors')
parser.add_argument('--pvalue-lookback-time', type=float, default=150,
metavar='SECONDS',
help='Lookback time for the calculation of the p-value in '
'followup detectors.')
parser.add_argument('--pvalue-combination-livetime', type=float, required=True,
help="Livetime used for p-value combination with followup "
"detectors, in years")
@@ -1106,13 +1126,10 @@ with ctx:
print(e)
exit()

maxlen = args.psd_segment_length * (args.psd_samples // 2 + 1)
if evnt.rank > 0:
bank.table.sort(order='mchirp')
waveforms = list(bank[evnt.rank-1::evnt.size-1])
lengths = numpy.array([1.0 / wf.delta_f for wf in waveforms])
psd_len = args.psd_segment_length * (args.psd_samples // 2 + 1)
maxlen = max(lengths.max(), psd_len)
check_max_length(args, waveforms)
mf = LiveBatchMatchedFilter(waveforms, args.snr_threshold,
args.chisq_bins, sg_chisq,
snr_abort_threshold=args.snr_abort_threshold,
@@ -1134,11 +1151,8 @@ with ctx:
# Initialize the data readers for all detectors. For rank 0, we need data
# from all detectors, including the localization-only ones. For higher
# ranks, we only need the detectors that can generate candidates.
if args.max_length is not None:
maxlen = args.max_length
maxlen = int(maxlen)
data_reader = {
ifo: StrainBuffer.from_cli(ifo, args, maxlen)
ifo: StrainBuffer.from_cli(ifo, args)
for ifo in (evnt.ifos if evnt.rank == 0 else evnt.trigg_ifos)
}
evnt.data_readers = data_reader
2 changes: 1 addition & 1 deletion examples/live/run.sh
@@ -131,7 +131,7 @@ python -m mpi4py `which pycbc_live` \
--sample-rate 2048 \
--enable-bank-start-frequency \
--low-frequency-cutoff ${f_min} \
--max-length 256 \
--max-length 512 \
--approximant "SPAtmplt:mtotal<4" "SEOBNRv4_ROM:else" \
--chisq-bins "0.72*get_freq('fSEOBNRv4Peak',params.mass1,params.mass2,params.spin1z,params.spin2z)**0.7" \
--snr-abort-threshold 500 \
120 changes: 108 additions & 12 deletions pycbc/filter/matchedfilter.py
@@ -1804,7 +1804,50 @@ def followup_event_significance(ifo, data_reader, bank,
to determine if the SNR in the first detector has a significant peak
in the on-source window. The significance is given in terms of a
p-value. See Dal Canton et al. 2021 (https://arxiv.org/abs/2008.07494)
for details.
for details. A portion of the SNR time series around the on-source window
is also returned for use in BAYESTAR.

If the calculation cannot be carried out, for example because `ifo` is
not in observing mode at the requested time, then None is returned.
Otherwise, the dict contains the following keys. `snr_series` is a
TimeSeries object with the SNR time series for BAYESTAR. `peak_time` is the
time of maximum SNR in the on-source window. `pvalue` is the p-value for
the maximum on-source SNR compared to the off-source realizations.
`pvalue_saturated` is a bool indicating whether the p-value is limited by
the number of off-source realizations, i.e. whether the maximum on-source
SNR is larger than all the off-source ones. `sigma2` is the SNR
normalization (squared) for the given template and detector.

Parameters
----------
ifo: str
Which detector is being used for the calculation.
data_reader: StrainBuffer
StrainBuffer object providing the data for the given detector.
bank: LiveFilterBank
Template bank object providing the template related quantities.
template_id: int
Index of the template in the bank.
coinc_times: dict
Dictionary keyed by detector names reporting the coalescence times of
a candidate measured at the different detectors. Used to define the
on-source window of the candidate in `ifo`.
coinc_threshold: float
Nominal statistical uncertainty in `coinc_times`; expands the
on-source window by twice the given amount.
lookback: float
Nominal amount of time to use for the calculation of the on-source and
off-source SNR time series. The actual time may be reduced depending on
the duration of the template and the strain buffer in the data reader
(if so, a warning is logged).
duration: float
Duration of the SNR time series to be reported to BAYESTAR.

Returns
-------
followup_info: dict or None
Results of the followup calculation (see above) or None if `ifo` did
not have usable data.
"""
from pycbc.waveform import get_waveform_filter_length_in_time
tmplt = bank.table[template_id]
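A minimal sketch of how a caller might consume the returned dict (the detector name and the surrounding variables are hypothetical; the keys are the ones documented above):

    followup = followup_event_significance(
        'V1', data_reader, bank, template_id, coinc_times
    )
    if followup is not None:
        snr_series = followup['snr_series']  # TimeSeries for BAYESTAR
        print(followup['peak_time'], followup['pvalue'],
              followup['pvalue_saturated'], followup['sigma2'])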
@@ -1830,17 +1873,63 @@ def followup_event_significance(ifo, data_reader, bank,
onsource_start -= coinc_threshold
onsource_end += coinc_threshold

# Calculate how much time needed to calculate significance
trim_pad = (data_reader.trim_padding * data_reader.strain.delta_t)
bdur = int(lookback + 2.0 * trim_pad + length_in_time)
if bdur > data_reader.strain.duration * .75:
bdur = data_reader.strain.duration * .75
# Calculate how much time is needed to calculate the significance.
# At the minimum, we need enough time to include the lookback, plus time
# that we will throw away because of corruption from finite-duration filter
# responses (this is equal to the nominal padding plus the template
# duration). Next, for efficiency, we round the resulting duration up to
# align it with one of the frequency resolutions preferred by the template
# bank. And finally, the resulting duration must fit into the strain buffer
# available in the data reader, so we check that.
trim_pad = data_reader.trim_padding * data_reader.strain.delta_t
buffer_duration = lookback + 2 * trim_pad + length_in_time
buffer_samples = bank.round_up(int(buffer_duration * bank.sample_rate))
max_safe_buffer_samples = int(
0.9 * data_reader.strain.duration * bank.sample_rate
)
if buffer_samples > max_safe_buffer_samples:
buffer_samples = max_safe_buffer_samples
new_lookback = (
buffer_samples / bank.sample_rate - (2 * trim_pad + length_in_time)
)
# Require a minimum lookback time of twice the on-source window or the
# reported SNR time series (whichever is longer), so we have enough data
# for the on-source window, the SNR time series, and at least a few
# background samples.
min_required_lookback = 2 * max(onsource_end - onsource_start, duration)
if new_lookback > min_required_lookback:
logging.warning(
'Strain buffer too short for a lookback time of %f s, '
'reducing lookback to %f s',
lookback,
new_lookback
)
else:
logging.error(
'Strain buffer too short to compute the followup SNR time '
'series for template %d, will not use %s for followup. '
'Either use shorter templates, or raise --max-length.',
template_id,
ifo
)
return None
buffer_duration = buffer_samples / bank.sample_rate

# Require all strain be valid within lookback time
if data_reader.state is not None:
state_start_time = data_reader.strain.end_time \
- data_reader.reduced_pad * data_reader.strain.delta_t - bdur
if not data_reader.state.is_extent_valid(state_start_time, bdur):
state_start_time = (
data_reader.strain.end_time
- data_reader.reduced_pad * data_reader.strain.delta_t
- buffer_duration
)
if not data_reader.state.is_extent_valid(
state_start_time, buffer_duration
):
logging.info(
'%s strain buffer contains invalid data during lookback, '
'will not use for followup',
ifo
)
return None

# We won't require that all DQ checks be valid for now, except at
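A numeric sketch of the sizing logic above, under assumed values and with a power-of-two stand-in for bank.round_up (the real method aligns the length to the bank's preferred frequency resolutions):

    round_up = lambda n: 1 << (n - 1).bit_length()  # stand-in for bank.round_up
    sample_rate = 2048
    lookback, trim_pad, length_in_time = 150.0, 0.5, 90.0
    buffer_samples = round_up(
        int((lookback + 2 * trim_pad + length_in_time) * sample_rate)
    )  # 241 s of data, rounded up to 524288 samples = 256 s
    max_safe_buffer_samples = int(0.9 * 512 * sample_rate)  # 90% of a 512 s buffer
    if buffer_samples > max_safe_buffer_samples:
        # cap the buffer and shrink the lookback; give up if it gets too short
        buffer_samples = max_safe_buffer_samples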
@@ -1849,16 +1938,23 @@ def followup_event_significance(ifo, data_reader, bank,
dq_start_time = onsource_start - duration / 2.0
dq_duration = onsource_end - onsource_start + duration
if not data_reader.dq.is_extent_valid(dq_start_time, dq_duration):
logging.info(
'%s DQ buffer indicates invalid data during onsource window, '
'will not use for followup',
ifo
)
return None

# Calculate SNR time series for this duration
htilde = bank.get_template(template_id, min_buffer=bdur)
# Calculate SNR time series for the entire lookback duration
htilde = bank.get_template(
template_id, delta_f=bank.sample_rate / float(buffer_samples)
)
stilde = data_reader.overwhitened_data(htilde.delta_f)

sigma2 = htilde.sigmasq(stilde.psd)
snr, _, norm = matched_filter_core(htilde, stilde, h_norm=sigma2)

# Find peak in on-source and determine p-value
# Find peak SNR in on-source and determine p-value
onsrc = snr.time_slice(onsource_start, onsource_end)
peak = onsrc.abs_arg_max()
peak_time = peak * snr.delta_t + onsrc.start_time
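The off-source comparison itself is below the fold; as a hedged sketch of the empirical p-value described in the docstring, using the common (n+1)/(N+1) convention (both the convention and the numbers are assumptions, not necessarily the exact implementation):

    import numpy
    offsource_peaks = numpy.array([4.1, 3.8, 5.0, 4.4])  # hypothetical off-source maxima
    onsource_peak = 5.2                                  # hypothetical on-source maximum
    n_louder = numpy.count_nonzero(offsource_peaks >= onsource_peak)
    pvalue = (n_louder + 1) / (len(offsource_peaks) + 1)
    pvalue_saturated = n_louder == 0  # on-source beat every off-source realization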
80 changes: 42 additions & 38 deletions pycbc/strain/strain.py
@@ -1418,8 +1418,8 @@ def execute_cached_ifft(*args, **kwargs):

class StrainBuffer(pycbc.frame.DataBuffer):
def __init__(self, frame_src, channel_name, start_time,
max_buffer=512,
sample_rate=4096,
max_buffer,
sample_rate,
low_frequency_cutoff=20,
highpass_frequency=15.0,
highpass_reduction=200.0,
@@ -1460,9 +1460,9 @@ def __init__(self, frame_src, channel_name, start_time,
Name of the channel to read from the frame files
start_time:
Time to start reading from.
max_buffer: {int, 512}, Optional
Length of the buffer in seconds
sample_rate: {int, 2048}, Optional
max_buffer: int
Length of the strain buffer in seconds.
sample_rate: int
Rate in Hz to sample the data.
low_frequency_cutoff: {float, 20}, Optional
The low frequency cutoff to use for inverse spectrum truncation
@@ -1534,7 +1534,7 @@ def __init__(self, frame_src, channel_name, start_time,
filesystem.
"""
super(StrainBuffer, self).__init__(frame_src, channel_name, start_time,
max_buffer=32,
max_buffer=max_buffer,
force_update_cache=force_update_cache,
increment_update_cache=increment_update_cache)

@@ -1950,7 +1950,7 @@ def advance(self, blocksize, timeout=10):
return self.wait_duration <= 0

@classmethod
def from_cli(cls, ifo, args, maxlen):
def from_cli(cls, ifo, args):
"""Initialize a StrainBuffer object (data reader) for a particular
detector.
"""
@@ -1982,34 +1982,38 @@ def from_cli(cls, ifo, args):
frame_src = [args.frame_src[ifo]]
strain_channel = ':'.join([ifo, args.channel_name[ifo]])

return cls(frame_src, strain_channel,
args.start_time, max_buffer=maxlen * 2,
state_channel=state_channel,
data_quality_channel=dq_channel,
idq_channel=idq_channel,
idq_state_channel=idq_state_channel,
idq_threshold=args.idq_threshold,
sample_rate=args.sample_rate,
low_frequency_cutoff=args.low_frequency_cutoff,
highpass_frequency=args.highpass_frequency,
highpass_reduction=args.highpass_reduction,
highpass_bandwidth=args.highpass_bandwidth,
psd_samples=args.psd_samples,
trim_padding=args.trim_padding,
psd_segment_length=args.psd_segment_length,
psd_inverse_length=args.psd_inverse_length,
autogating_threshold=args.autogating_threshold,
autogating_cluster=args.autogating_cluster,
autogating_pad=args.autogating_pad,
autogating_width=args.autogating_width,
autogating_taper=args.autogating_taper,
autogating_duration=args.autogating_duration,
autogating_psd_segment_length=args.autogating_psd_segment_length,
autogating_psd_stride=args.autogating_psd_stride,
psd_abort_difference=args.psd_abort_difference,
psd_recalculate_difference=args.psd_recalculate_difference,
force_update_cache=args.force_update_cache,
increment_update_cache=args.increment_update_cache[ifo],
analyze_flags=analyze_flags,
data_quality_flags=dq_flags,
dq_padding=args.data_quality_padding)
return cls(
frame_src,
strain_channel,
args.start_time,
max_buffer=args.max_length,
state_channel=state_channel,
data_quality_channel=dq_channel,
idq_channel=idq_channel,
idq_state_channel=idq_state_channel,
idq_threshold=args.idq_threshold,
sample_rate=args.sample_rate,
low_frequency_cutoff=args.low_frequency_cutoff,
highpass_frequency=args.highpass_frequency,
highpass_reduction=args.highpass_reduction,
highpass_bandwidth=args.highpass_bandwidth,
psd_samples=args.psd_samples,
trim_padding=args.trim_padding,
psd_segment_length=args.psd_segment_length,
psd_inverse_length=args.psd_inverse_length,
autogating_threshold=args.autogating_threshold,
autogating_cluster=args.autogating_cluster,
autogating_pad=args.autogating_pad,
autogating_width=args.autogating_width,
autogating_taper=args.autogating_taper,
autogating_duration=args.autogating_duration,
autogating_psd_segment_length=args.autogating_psd_segment_length,
autogating_psd_stride=args.autogating_psd_stride,
psd_abort_difference=args.psd_abort_difference,
psd_recalculate_difference=args.psd_recalculate_difference,
force_update_cache=args.force_update_cache,
increment_update_cache=args.increment_update_cache[ifo],
analyze_flags=analyze_flags,
data_quality_flags=dq_flags,
dq_padding=args.data_quality_padding
)
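Since max_buffer and sample_rate are now required, any direct construction outside from_cli() must pass both explicitly. A hedged sketch (frame file, channel and times are hypothetical):

    buf = StrainBuffer(
        ['H-H1_llhoft.gwf'],    # frame_src
        'H1:GDS-CALIB_STRAIN',  # channel_name
        1187008882,             # start_time
        max_buffer=512,         # seconds of strain kept, matching --max-length
        sample_rate=2048,
    )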
