From ec8bd5def9919db8187ab3b4f6bebbdacf98efae Mon Sep 17 00:00:00 2001
From: Gareth S Cabourn Davies
Date: Mon, 5 Feb 2024 09:20:04 +0000
Subject: [PATCH] Some changes to HFile.select() and SingleDetTriggers (#4545)

* Some changes to HFile.select() and mask_to_n_loudest_clustered
* missed comma
* Some CC problems
* Update
* revert changes which meant kwargs couldn't be passed to mask_to_n_loudest_clustered_events
* CC / fixes
* Use list rather than numpy array for mask
* remove messing about with the mask dtyping
* initialise a transparent mask if it is not given
* Fix problem when the filter value may not be available to the HFile
* thinko in pycbc_bin_trigger_rates_dq
* Don't make a huge boolean mask of all True values and then run nonzero on it = BAD IDEA
* CC
* re-enable option to have no mask
* efficiency saving in pycbc_plot_hist
* no mask option in trig_dict
* typo
* allow None mask in mask_size
* thought I could apply another mask but I can't
* get filter_func back, use new interface for recent changes
* Some minor fixes
* cc
* allow option to do a logical and on masks
* various minor bugfixes
* max z problem
* fixes to pycbc_page_snglinfo
* use alternative dict for better clarity
* IWH comments
* add check for required datasets (mostly to remind people to add the required dataset list if they add a new ranking!)
* fix bad check, remove min_snr option
* remove option from CI
* Fix bug with applying premask and using filter_rank threshold at the same time
* stop checking for required datasets - this is difficult when so many different types use the same function
* CC prefers np.flatnonzero(arr) to arr.nonzero()[0]
* Fix a couple of rarely-used scripts, remove unused imports, whitespace
---
 bin/all_sky_search/pycbc_bin_trigger_rates_dq | 60 +--
 .../pycbc_fit_sngls_split_binned | 35 +-
 .../pycbc_injection_minifollowup | 42 +-
 bin/minifollowups/pycbc_page_snglinfo | 50 ++-
 .../pycbc_plot_trigger_timeseries | 15 +-
 bin/minifollowups/pycbc_sngl_minifollowup | 32 +-
 bin/plotting/pycbc_plot_hist | 19 +-
 bin/plotting/pycbc_plot_singles_timefreq | 18 +-
 bin/plotting/pycbc_plot_singles_vs_params | 65 ++--
 bin/pycbc_fit_sngl_trigs | 27 +-
 bin/pycbc_process_sngls | 11 +-
 docs/hwinj.rst | 2 +-
 examples/search/plotting.ini | 3 +-
 pycbc/events/ranking.py | 17 +-
 pycbc/io/hdf.py | 363 ++++++++++++++----
 15 files changed, 468 insertions(+), 291 deletions(-)

diff --git a/bin/all_sky_search/pycbc_bin_trigger_rates_dq b/bin/all_sky_search/pycbc_bin_trigger_rates_dq
index 1bb84299bd0..25234a687f0 100644
--- a/bin/all_sky_search/pycbc_bin_trigger_rates_dq
+++ b/bin/all_sky_search/pycbc_bin_trigger_rates_dq
@@ -15,7 +15,7 @@ from pycbc.events.veto import (select_segments_by_definer,
                                start_end_to_segments,
                                segments_to_start_end)
 from pycbc.types.optparse import MultiDetOptionAction
-from pycbc.io.hdf import HFile, SingleDetTriggers
+from pycbc.io.hdf import SingleDetTriggers
 from pycbc.version import git_verbose_msg as version
 
 parser = argparse.ArgumentParser(description=__doc__)
@@ -46,54 +46,11 @@ logging.info('Start')
 
 ifo, flag_name = args.flag_name.split(':')
 
-# Setup a data mask to remove any triggers with SNR below threshold
-# This works as a pre-filter as SNR is always greater than or equal
-# to sngl_ranking, except in the psdvar case, where it could increase.
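As an illustration of where this pre-filter logic moves: the hand-built mask deleted below is now handled inside SingleDetTriggers via the filter_rank/filter_threshold arguments added in pycbc/io/hdf.py. A minimal sketch of the replacement interface (the trigger file name is a hypothetical placeholder):

    from pycbc.io.hdf import SingleDetTriggers

    # 'H1-TRIGGERS.hdf' is a hypothetical merged single-detector trigger file
    trigs = SingleDetTriggers(
        'H1-TRIGGERS.hdf',
        'H1',
        filter_rank='newsnr',  # any key of ranking.sngls_ranking_function_dict
        filter_threshold=6.0,
    )
    stat = trigs.get_ranking('newsnr')  # ranking values of surviving triggers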
- -with HFile(args.trig_file, 'r') as trig_file: - n_triggers_orig = trig_file[f'{ifo}/snr'].size - logging.info("Trigger file has %d triggers", n_triggers_orig) - logging.info('Generating trigger mask') - if f'{ifo}/psd_var_val' in trig_file: - idx, _, _ = trig_file.select( - lambda snr, psdvar: snr / psdvar ** 0.5 >= args.stat_threshold, - f'{ifo}/snr', - f'{ifo}/psd_var_val', - return_indices=True - ) - else: - # psd_var_val may not have been calculated - idx, _ = trig_file.select( - lambda snr: snr >= args.stat_threshold, - f'{ifo}/snr', - return_indices=True - ) - data_mask = np.zeros(n_triggers_orig, dtype=bool) - data_mask[idx] = True - - # also get the gated times while we have the file open - gate_times = [] - if args.gating_windows: - logging.info('Getting gated times') - try: - gating_types = trig_file[f'{ifo}/gating'].keys() - for gt in gating_types: - gate_times += list(trig_file[f'{ifo}/gating/{gt}/time'][:]) - gate_times = np.unique(gate_times) - except KeyError: - logging.warning('No gating found in trigger file') - -logging.info("Getting %s triggers from file with pre-cut SNR > %.3f", - idx.size, args.stat_threshold) - trigs = SingleDetTriggers( args.trig_file, - None, - None, - None, - None, ifo, - premask=data_mask + filter_rank=args.sngl_ranking, + filter_threshold=args.stat_threshold, ) # Extract the data we actually need from the data structure: @@ -101,16 +58,6 @@ tmplt_ids = trigs.template_id trig_times = trigs.end_time stat = trigs.get_ranking(args.sngl_ranking) -n_triggers = tmplt_ids.size - -logging.info("Applying %s > %.3f cut", args.sngl_ranking, - args.stat_threshold) -keep = stat >= args.stat_threshold -tmplt_ids = tmplt_ids[keep] -trig_times = trig_times[keep] -logging.info("Removed %d triggers, %d remain", - n_triggers - tmplt_ids.size, tmplt_ids.size) - # Get the template bins bin_tids_dict = {} with h5.File(args.template_bins_file, 'r') as f: @@ -118,7 +65,6 @@ with h5.File(args.template_bins_file, 'r') as f: for bin_name in ifo_grp.keys(): bin_tids_dict[bin_name] = ifo_grp[bin_name]['tids'][:] - # get analysis segments analysis_segs = select_segments_by_definer( args.analysis_segment_file, diff --git a/bin/all_sky_search/pycbc_fit_sngls_split_binned b/bin/all_sky_search/pycbc_fit_sngls_split_binned index 79fa51e575f..d77581ca81b 100644 --- a/bin/all_sky_search/pycbc_fit_sngls_split_binned +++ b/bin/all_sky_search/pycbc_fit_sngls_split_binned @@ -22,7 +22,7 @@ from matplotlib import pyplot as plt import numpy as np from pycbc import events, bin_utils, results -from pycbc.io import HFile, SingleDetTriggers +from pycbc.io import SingleDetTriggers from pycbc.events import triggers as trigs from pycbc.events import trigger_fits as trstats from pycbc.events import stat as pystat @@ -202,40 +202,12 @@ boundaries = trigf[args.ifo + '/template_boundaries'][:] trigf.close() -# Setup a data mask to remove any triggers with SNR below threshold -# This works as a pre-filter as SNR is always greater than or equal -# to sngl_ranking, except in the psdvar case, where it could increase. 
-with HFile(args.trigger_file, 'r') as trig_file: - n_triggers_orig = trig_file[f'{args.ifo}/snr'].size - logging.info("Trigger file has %d triggers", n_triggers_orig) - logging.info('Generating trigger mask') - if f'{args.ifo}/psd_var_val' in trig_file: - idx, _, _ = trig_file.select( - lambda snr, psdvar: snr / psdvar ** 0.5 >= args.plot_lower_stat_limit, - f'{args.ifo}/snr', - f'{args.ifo}/psd_var_val', - return_indices=True - ) - else: - # psd_var_val may not have been calculated - idx, _ = trig_file.select( - lambda snr: snr >= args.plot_lower_stat_limit, - f'{args.ifo}/snr', - return_indices=True - ) - data_mask = np.zeros(n_triggers_orig, dtype=bool) - data_mask[idx] = True - - logging.info('Calculating single stat values from trigger file') trigs = SingleDetTriggers( args.trigger_file, - None, - None, - None, - None, args.ifo, - premask=data_mask + filter_rank=args.sngl_ranking, + filter_threshold=args.plot_lower_stat_limit, ) # This is the direct pointer to the HDF file, used later on trigf = trigs.trigs_f @@ -243,7 +215,6 @@ trigf = trigs.trigs_f stat = trigs.get_ranking(args.sngl_ranking) time = trigs.end_time - logging.info('Processing template boundaries') max_boundary_id = np.argmax(boundaries) sorted_boundary_list = np.sort(boundaries) diff --git a/bin/minifollowups/pycbc_injection_minifollowup b/bin/minifollowups/pycbc_injection_minifollowup index 198ca2a34bf..768c475c253 100644 --- a/bin/minifollowups/pycbc_injection_minifollowup +++ b/bin/minifollowups/pycbc_injection_minifollowup @@ -244,15 +244,17 @@ def nearby_missedinj(endtime, snr): trigger_idx = {} trigger_snrs = {} trigger_times = {} +# This finds the triggers near to _any_ missed injection for trig in single_triggers: ifo = trig.ifo with HFile(trig.lfn, 'r') as trig_f: - trigger_idx[ifo], trigger_times[ifo], trigger_snrs[ifo] = \ + trigger_idx[ifo], data_tuple = \ trig_f.select( nearby_missedinj, f'{ifo}/end_time', f'{ifo}/snr', - return_indices=True) + ) + trigger_times[ifo], trigger_snrs[ifo] = data_tuple # figure out how many injections to follow up num_events = int(workflow.cp.get_opt_tags( @@ -305,7 +307,7 @@ for num_event in range(num_events): tags=args.tags + [str(num_event)])[0],)] for sngl in single_triggers: - # Find the triggers close to this injection at this IFO + # Find the triggers close to _this_ injection at this IFO ifo = sngl.ifo trig_tdiff = abs(inj_params[ifo + '_end_time'] - trigger_times[ifo]) nearby = trig_tdiff < args.nearby_triggers_window @@ -315,7 +317,7 @@ for num_event in range(num_events): continue # Find the loudest SNR in this window loudest = numpy.argmax(trigger_snrs[ifo][nearby]) - # Convert to the indexin the trigger file + # Convert to the index in the trigger file nearby_trigger_idx = trigger_idx[ifo][nearby][loudest] # Make the info snippet sngl_info = mini.make_sngl_ifo(workflow, sngl, tmpltbank_file, @@ -381,19 +383,31 @@ for num_event in range(num_events): use_exact_inj_params=True) for curr_ifo in args.single_detector_triggers: + # Finding loudest template in this detector near to the injection: + # First, find triggers close to the missed injection single_fname = args.single_detector_triggers[curr_ifo] - hd_sngl = SingleDetTriggers(single_fname, args.bank_file, None, None, - None, curr_ifo) - end_times = hd_sngl.end_time - # Use SNR here or NewSNR?? 
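The select() calls above follow the reworked return convention defined in pycbc/io/hdf.py below: an index array is always returned first, followed by a tuple of data arrays, or None in place of the tuple when return_data=False. A minimal sketch with a hypothetical file name:

    from pycbc.io.hdf import HFile

    # 'H1-TRIGGERS.hdf' is a hypothetical merged single-detector trigger file
    with HFile('H1-TRIGGERS.hdf', 'r') as trig_file:
        idx, (end_time, snr) = trig_file.select(
            lambda t, s: s > 8,
            'end_time',
            'snr',
            group='H1',
        )
    # idx indexes into the full datasets; end_time and snr hold only the
    # values for which the lambda returned True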
- snr = hd_sngl.snr - lgc_mask = abs(end_times - inj_params['tc']) < args.inj_window - - if len(snr[lgc_mask]) == 0: + idx, _ = HFile(single_fname).select( + lambda t: abs(t - inj_params['tc']) < args.inj_window, + f'{curr_ifo}/end_time', + return_data=False, + ) + + if len(idx) == 0: + # No events in the injection window for this detector continue - snr_idx = numpy.arange(len(lgc_mask))[lgc_mask][snr[lgc_mask].argmax()] - hd_sngl.mask = [snr_idx] + hd_sngl = SingleDetTriggers( + single_fname, + curr_ifo, + bank_file=args.bank_file, + premask=idx + ) + # Next, find the loudest within this set of triggers + # Use SNR here or NewSNR, or other?? + loudest_idx = hd_sngl.snr.argmax() + hd_sngl.apply_mask([loudest_idx]) + + # What are the parameters of this trigger? curr_params = copy.deepcopy(inj_params) curr_params['mass1'] = hd_sngl.mass1[0] curr_params['mass2'] = hd_sngl.mass2[0] diff --git a/bin/minifollowups/pycbc_page_snglinfo b/bin/minifollowups/pycbc_page_snglinfo index eb248daecd1..5d741e66946 100644 --- a/bin/minifollowups/pycbc_page_snglinfo +++ b/bin/minifollowups/pycbc_page_snglinfo @@ -21,7 +21,7 @@ matplotlib.use('Agg') import lal import pycbc.version, pycbc.events, pycbc.results, pycbc.pnutils from pycbc.results import followup -from pycbc.events import stat +from pycbc.events import stat as pystat from pycbc.io import hdf @@ -62,7 +62,7 @@ parser.add_argument('--include-gracedb-link', action='store_true', parser.add_argument('--significance-file', help="If given, will search for this trigger's id in the file to see if " "stat and p_astro values exists for this trigger.") -stat.insert_statistic_option_group(parser, +pystat.insert_statistic_option_group(parser, default_ranking_statistic='single_ranking_only') @@ -70,29 +70,45 @@ args = parser.parse_args() pycbc.init_logging(args.verbose) +if args.trigger_id is not None: + # Make a mask which is just the trigger of interest + mask = numpy.array([args.trigger_id]) +else: + mask = None + # Get the single-ifo triggers -sngl_file = hdf.SingleDetTriggers(args.single_trigger_file, args.bank_file, - args.veto_file, args.veto_segment_name, None, args.instrument) +sngl_file = hdf.SingleDetTriggers( + args.single_trigger_file, + args.instrument, + bank_file=args.bank_file, + veto_file=args.veto_file, + segment_name=args.veto_segment_name, + premask=mask, +) + +rank_method = pystat.get_statistic_from_opts(args, [args.instrument]) -rank_method = stat.get_statistic_from_opts(args, [args.instrument]) if args.trigger_id is not None: - sngl_file.mask = [args.trigger_id] - trig_id = args.trigger_id - sngl_file.mask_to_n_loudest_clustered_events(rank_method, n_loudest=1) - stat = sngl_file.stat[0] + # Mask already applied + pass elif args.n_loudest is not None: # Cluster by a ranking statistic and retain only the loudest n clustered # triggers - sngl_file.mask_to_n_loudest_clustered_events(rank_method, - n_loudest=args.n_loudest+1) + sngl_file.mask_to_n_loudest_clustered_events( + rank_method, + n_loudest=args.n_loudest+1 + ) +else: + raise ValueError("Must give --n-loudest or --trigger-id.") + +sds = rank_method.single(sngl_file.trig_dict()) +stat = rank_method.rank_stat_single((sngl_file.ifo, sds)) +if args.n_loudest is not None: # Restrict to only the nth loudest, instead of all triggers up to nth # loudest - stat = sngl_file.stat l = stat.argsort() stat = stat[l[0]] - sngl_file.mask = sngl_file.mask[l[0]] -else: - raise ValueError("Must give --n-loudest or --trigger-id.") + sngl_file.apply_mask(l[0]) # make a table for the single detector 
information ############################ time = sngl_file.end_time @@ -142,7 +158,9 @@ if args.ranking_statistic in ["quadsum", "single_ranking_only"]: else: # Name would be too long - just call it ranking statistic stat_name = 'Ranking Statistic' - stat_name_long = ' with '.join([args.ranking_statistic, args.sngl_ranking]) + stat_name_long = ' with '.join( + [args.ranking_statistic, args.sngl_ranking] + ) headers.append(stat_name) diff --git a/bin/minifollowups/pycbc_plot_trigger_timeseries b/bin/minifollowups/pycbc_plot_trigger_timeseries index ef9829dab01..933870143a4 100644 --- a/bin/minifollowups/pycbc_plot_trigger_timeseries +++ b/bin/minifollowups/pycbc_plot_trigger_timeseries @@ -61,9 +61,13 @@ for ifo in args.single_trigger_files.keys(): # Identify trigger idxs within window of trigger time with HFile(args.single_trigger_files[ifo], 'r') as data: - idx, _ = data.select(lambda endtime: abs(endtime - t) < args.window, - f'{ifo}/end_time', return_indices=True) - data_mask = numpy.zeros(len(data[f'{ifo}/snr']), dtype=bool) + idx, _ = data.select( + lambda endtime: abs(endtime - t) < args.window, + 'end_time', + group=ifo, + return_data=False, + ) + data_mask = numpy.zeros(data[ifo]['snr'].size, dtype=bool) data_mask[idx] = True if not len(idx): @@ -75,13 +79,8 @@ for ifo in args.single_trigger_files.keys(): label=ifo) continue - # Not using bank file, or veto file, so lots of "None"s here. trigs = SingleDetTriggers( args.single_trigger_files[ifo], - None, - None, - None, - None, ifo, premask=data_mask ) diff --git a/bin/minifollowups/pycbc_sngl_minifollowup b/bin/minifollowups/pycbc_sngl_minifollowup index 01fbc743e83..5515e87a930 100644 --- a/bin/minifollowups/pycbc_sngl_minifollowup +++ b/bin/minifollowups/pycbc_sngl_minifollowup @@ -65,8 +65,9 @@ parser.add_argument('--inspiral-data-read-name', parser.add_argument('--inspiral-data-analyzed-name', help="Name of inspiral segmentlist containing data " "analyzed by each analysis job.") -parser.add_argument('--min-snr', type=float, default=6.5, - help="Minimum SNR to consider for loudest triggers") +parser.add_argument('--min-sngl-ranking', type=float, default=6.5, + help="Minimum sngl-ranking to consider for loudest " + "triggers. 
Default=6.5.") parser.add_argument('--non-coinc-time-only', action='store_true', help="If given remove (veto) single-detector triggers " "that occur during a time when at least one other " @@ -130,31 +131,14 @@ insp_data_seglists.coalesce() num_events = int(workflow.cp.get_opt_tags('workflow-sngl_minifollowups', 'num-sngl-events', '')) -# This helps speed up the processing to ignore a large fraction of triggers -mask = None -f = hdf.HFile(args.single_detector_file, 'r') -n_triggers = f['{}/snr'.format(args.instrument)].size -logging.info("%i triggers in file", n_triggers) -if args.min_snr: - logging.info('Calculating Prefilter') - idx, _ = f.select(lambda snr: snr > args.min_snr, - '{}/snr'.format(args.instrument), - return_indices=True) - mask = numpy.zeros(n_triggers, dtype=bool) - mask[idx] = True - if len(idx) < num_events: - logging.info("Fewer triggers exist after the --min-snr cut (%d) " - "than requested for the minifollowup (%d)", - len(idx), num_events) - trigs = hdf.SingleDetTriggers( args.single_detector_file, - args.bank_file, - args.foreground_censor_file, - args.foreground_segment_name, - None, args.instrument, - premask=mask + bank_file=args.bank_file, + veto_file=args.foreground_censor_file, + segment_name=args.foreground_segment_name, + filter_rank=args.sngl_ranking, + filter_threshold=args.min_sngl_ranking ) # Include gating vetoes diff --git a/bin/plotting/pycbc_plot_hist b/bin/plotting/pycbc_plot_hist index 4e76b69bb47..b252c82b430 100644 --- a/bin/plotting/pycbc_plot_hist +++ b/bin/plotting/pycbc_plot_hist @@ -7,7 +7,7 @@ import pycbc.version, pycbc.results, pycbc.io from itertools import cycle from matplotlib import use; use('Agg') from matplotlib import pyplot -from pycbc.events import background_bin_from_string, veto +from pycbc.events import background_bin_from_string, veto, ranking parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('--version', action='version', version=pycbc.version.git_verbose_msg) @@ -46,10 +46,21 @@ pycbc.init_logging(args.verbose) f = h5py.File(args.trigger_file, 'r') ifo = tuple(f.keys())[0] +# Apply presort if we can; if we can't, load everything but it may be +# inefficient +xv = args.x_var if args.x_var in ranking.sngls_ranking_function_dict else None +xt = args.x_min if args.x_var in ranking.sngls_ranking_function_dict else None + # read single-detector triggers -trigs = pycbc.io.SingleDetTriggers(args.trigger_file, args.bank_file, - args.veto_file, args.segment_name, - None, ifo) +trigs = pycbc.io.SingleDetTriggers( + args.trigger_file, + ifo, + bank_file=args.bank_file, + veto_file=args.veto_file, + segment_name=args.segment_name, + filter_rank=xv, + filter_threshold=xt, +) # get x values and find the maximum x value val = getattr(trigs, args.x_var) diff --git a/bin/plotting/pycbc_plot_singles_timefreq b/bin/plotting/pycbc_plot_singles_timefreq index 20f480240f9..e8d12e9575a 100644 --- a/bin/plotting/pycbc_plot_singles_timefreq +++ b/bin/plotting/pycbc_plot_singles_timefreq @@ -105,13 +105,17 @@ def rough_filter(snr, chisq, chisq_dof, end_time, tmp_id, tmp_dur): return np.logical_and(end_time > opts.gps_start_time, end_time < opts.gps_end_time + tmp_dur) -indices, snr, chisq, chisq_dof, end_time, template_ids, template_duration = \ - trig_f.select(rough_filter, opts.detector + '/snr', - opts.detector + '/chisq', opts.detector + '/chisq_dof', - opts.detector + '/end_time', - opts.detector + '/template_id', - opts.detector + '/template_duration', - return_indices=True) +indices, data_tuple = trig_f.select( + 
rough_filter, + 'snr', + 'chisq', + 'chisq_dof', + 'end_time', + 'template_id', + 'template_duration', + group=opts.detector +) +snr, chisq, chisq_dof, end_time, template_ids, template_duration = data_tuple if len(indices) > 0: if opts.veto_file: diff --git a/bin/plotting/pycbc_plot_singles_vs_params b/bin/plotting/pycbc_plot_singles_vs_params index c834037a30d..f588689ddd4 100644 --- a/bin/plotting/pycbc_plot_singles_vs_params +++ b/bin/plotting/pycbc_plot_singles_vs_params @@ -52,8 +52,6 @@ parser.add_argument('--segment-name', default=None, type=str, help='Optional, name of segment list to use for vetoes') parser.add_argument('--filter-string', default=None, type=str, help='Optional, boolean expression for filtering triggers') -parser.add_argument('--min-snr', default=0., type=float, - help='Only plot triggers above the given SNR') parser.add_argument('--output-file', type=str, required=True, help='Destination path for plot') parser.add_argument('--x-var', required=True, @@ -84,30 +82,27 @@ opts = parser.parse_args() logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) -data_mask = None -if opts.min_snr > 0: - with pycbc.io.HFile(opts.single_trig_file, 'r') as trig_file: - n_triggers_orig = trig_file[f'{opts.detector}/snr'].size - logging.info("Trigger file has %d triggers", n_triggers_orig) - logging.info('Generating trigger mask (on SNR)') - idx, _ = trig_file.select( - lambda snr: snr >= opts.min_snr, - f'{opts.detector}/snr', - return_indices=True - ) - data_mask = np.zeros(n_triggers_orig, dtype=bool) - data_mask[idx] = True +if opts.z_var == 'density' or opts.min_z is None: + filter_rank = None + filter_thresh = None +else: + # We can apply a/another filter on the z value + filter_rank = opts.z_var + filter_thresh = opts.min_z trigs = pycbc.io.SingleDetTriggers( opts.single_trig_file, - opts.bank_file, - opts.veto_file, - opts.segment_name, - opts.filter_string, opts.detector, - premask=data_mask + bank_file=opts.bank_file, + veto_file=opts.veto_file, + segment_name=opts.segment_name, + filter_func=opts.filter_string, + filter_rank=filter_rank, + filter_threshold=filter_thresh ) +# Can this be folded into the SingleDetTriggers call? Maybe, but we can +# do this later if needed x = getattr(trigs, opts.x_var) y = getattr(trigs, opts.y_var) @@ -123,6 +118,29 @@ if opts.max_y is not None: x = x[mask] y = y[mask] +title = f'{opts.z_var.title()} of {opts.detector} triggers ' + \ + f'over {opts.x_var.title()} and {opts.y_var.title()}' +fig_caption = f"This plot shows the {opts.z_var} of single detector " + \ + f"triggers for the {opts.detector} detector. " + \ + f"{opts.z_var.title()} is shown on the colorbar axis " + \ + f"against {opts.x_var} and {opts.y_var} on the x- and y-axes." 
+ +if not any(mask): + # All triggers removed - make a blank plot which says so: + fig = pl.figure() + ax = fig.gca() + pl.text(0.5, 0.5, 'no triggers in the range') + + pycbc.results.save_fig_with_metadata( + fig, + opts.output_file, + title=title, + caption=fig_caption, + cmd=' '.join(sys.argv), + fig_kwds={'dpi': 200} + ) + sys.exit() + hexbin_style = { 'gridsize': opts.grid_size, 'linewidths': 0.03 @@ -145,7 +163,6 @@ logging.info('Plotting') fig = pl.figure() ax = fig.gca() - if opts.z_var == 'density': hb = ax.hexbin(x, y, **hexbin_style) fig.colorbar(hb, ticks=LogLocator(subs=range(10))) @@ -166,12 +183,6 @@ else: ax.set_xlabel(opts.x_var) ax.set_ylabel(opts.y_var) ax.set_title(opts.z_var.title() + ' of %s triggers ' % (opts.detector)) -title = '%s of %s triggers over %s and %s' % (opts.z_var.title(), - opts.detector, opts.x_var.title(), opts.y_var.title()) -fig_caption = ("This plot shows the %s of single detector triggers for the %s " - "detector. %s is shown on the colorbar axis against %s and %s " - "on the x- and y-axes." % (opts.z_var, opts.detector, - opts.z_var.title(), opts.x_var, opts.y_var)) pycbc.results.save_fig_with_metadata(fig, opts.output_file, title=title, caption=fig_caption, cmd=' '.join(sys.argv), fig_kwds={'dpi': 200}) diff --git a/bin/pycbc_fit_sngl_trigs b/bin/pycbc_fit_sngl_trigs index 79af37fe680..cef76488606 100644 --- a/bin/pycbc_fit_sngl_trigs +++ b/bin/pycbc_fit_sngl_trigs @@ -204,9 +204,14 @@ for ifo in opt.ifos: elif trigformat == "hdf": if opt.bank_file: # io function can currently only handle 1 file - sngls = io.hdf.SingleDetTriggers(opt.inputs[0], - opt.bank_file, opt.veto_file, opt.veto_segment_name, - filter_func, ifo) + sngls = io.hdf.SingleDetTriggers( + opt.inputs[0], + ifo, + bank_file=opt.bank_file, + veto_file=opt.veto_file, + segment_name=opt.veto_segment_name, + filter_func=filter_func, + ) done_vetoing = True else: sngls = io.hdf.DataFromFiles(opt.inputs, group=ifo, @@ -236,10 +241,10 @@ for ifo in opt.ifos: #rchisq = np.maximum(rchisq, bank_rchisq) if opt.veto_file and not done_vetoing: - logging.info("Applying vetoes from %s for ifo %s" % + logging.info("Applying vetoes from %s for ifo %s" % (opt.veto_file, ifo)) keep_idx, segs = events.veto.indices_outside_segments( - sngls.get_column("end_time").astype(int), + sngls.get_column("end_time").astype(int), [opt.veto_file], ifo=ifo, segment_name=opt.veto_segment_name) snr = snr[keep_idx] @@ -304,7 +309,7 @@ for ifo in opt.ifos: else: mass1_inbin = mass1[pind == i] mass2_inbin = mass2[pind == i] - mass_tuples = [(m1, m2) for m1, m2 in + mass_tuples = [(m1, m2) for m1, m2 in zip(mass1_inbin, mass2_inbin)] numtmpl = len(set(mass_tuples)) templates[ifo][i] = numtmpl @@ -326,7 +331,7 @@ for ifo in opt.ifos: stdev[ifo][th][i] = sig_alpha _, ks_prob[ifo][th][i] = trstats.KS_test( opt.fit_function, vals_inbin, alpha, th) - outfile.write("%s %.2f %.3g %.3g %d %d %.3f %.3f %.3g\n" % + outfile.write("%s %.2f %.3g %.3g %d %d %.3f %.3f %.3g\n" % (ifo, th, lower, upper, numtmpl, counts[ifo][th][i], alpha, sig_alpha, ks_prob[ifo][th][i])) @@ -335,7 +340,7 @@ for ifo in opt.ifos: histcounts, edges = np.histogram(vals_inbin, bins=50) cum_counts = histcounts[::-1].cumsum()[::-1] binlabel = r"%.2g - %.2g" % (lower, upper) - plt.semilogy(edges[:-1], cum_counts, linewidth=2, + plt.semilogy(edges[:-1], cum_counts, linewidth=2, color=histcolors[i], label=binlabel, alpha=0.6) plt.semilogy(plotrange, counts[ifo][th][i] * \ @@ -348,7 +353,7 @@ for ifo in opt.ifos: plt.semilogy(plotrange, counts[ifo][th][i] * \ 
trstats.cum_fit(opt.fit_function, plotrange, alpha - \ sig_alpha, th), ":", alpha=0.6, color=histcolors[i]) - + if opt.plot_dir: leg = plt.legend(labelspacing=0.2) plt.setp(leg.get_texts(), fontsize=11) @@ -370,7 +375,7 @@ if opt.plot_dir: for ifo in fits.keys(): for th in opt.stat_threshold: plt.errorbar(pbins.centres(), [fits[ifo][th][i] for i in binind], - yerr=[stdev[ifo][th][i] for i in binind], fmt="+-", + yerr=[stdev[ifo][th][i] for i in binind], fmt="+-", label=ifo + " fit above %.2f" % th) if opt.bin_spacing == "log": plt.semilogx() plt.grid() @@ -381,7 +386,7 @@ if opt.plot_dir: plt.close() for th in opt.stat_threshold: - plt.plot(pbins.centres(), [ks_prob[ifo][th][i] for i in binind], + plt.plot(pbins.centres(), [ks_prob[ifo][th][i] for i in binind], '+--', label=ifo+' KS prob, thresh %.2f' % th) if opt.bin_spacing == 'log': plt.loglog() else : plt.semilogy() diff --git a/bin/pycbc_process_sngls b/bin/pycbc_process_sngls index 4174a72b9a8..c740415ecdd 100644 --- a/bin/pycbc_process_sngls +++ b/bin/pycbc_process_sngls @@ -57,9 +57,14 @@ else: if filter_func is not None: logging.info('Will filter trigs using %s', filter_func) # Filter will be stored as self.mask attribute of sngls instance -sngls = SingleDetTriggers(args.single_trig_file, args.bank_file, - args.veto_file, args.segment_name, filter_func, - args.detector) +sngls = SingleDetTriggers( + args.single_trig_file, + args.detector, + bank_file=args.bank_file, + veto_file=args.veto_file, + segment_name=args.segment_name, + filter_func=filter_func, +) logging.info('Calculating stat') rank_method = stat.get_statistic_from_opts(args, [args.detector]) diff --git a/docs/hwinj.rst b/docs/hwinj.rst index 11da93a62c7..5072b23ab6a 100644 --- a/docs/hwinj.rst +++ b/docs/hwinj.rst @@ -202,7 +202,7 @@ Where ``${START}`` is the start of the injection. 
We kept the same PSD options ( You can print out the recovered SNR and other parameters as follows :: echo `python -c "import numpy;from pycbc.io.hdf import SingleDetTriggers; \ - h1_triggers=SingleDetTriggers('${INSPIRAL_FILE}',None, None, None, None, 'H1'); \ + h1_triggers=SingleDetTriggers('${INSPIRAL_FILE}', 'H1'); \ imax=numpy.argmax(h1_triggers.snr); max_snr=h1_triggers.snr[imax]; \ time=h1_triggers.end_time[imax]; print(time, max_snr)"` diff --git a/examples/search/plotting.ini b/examples/search/plotting.ini index 7b3c92496e4..0b1ab2a2cbe 100644 --- a/examples/search/plotting.ini +++ b/examples/search/plotting.ini @@ -164,14 +164,13 @@ gradient-far = missed-on-top = [plot_singles] -min-snr = 6 [plot_singles-mtotal_eta_newsnr] x-var = mtotal log-x = y-var = eta z-var = "newsnr_sgveto" -min-z = 6 +min-z = 5.5 [plot_range] diff --git a/pycbc/events/ranking.py b/pycbc/events/ranking.py index e06a0f14ac0..5da85d01019 100644 --- a/pycbc/events/ranking.py +++ b/pycbc/events/ranking.py @@ -299,6 +299,21 @@ def get_newsnr_sgveto_psdvar_scaled_threshold(trigs): 'newsnr_sgveto_psdvar_scaled_threshold': get_newsnr_sgveto_psdvar_scaled_threshold, } +# Lists of datasets required in the trigs object for each function +required_datasets = {} +required_datasets['snr'] = ['snr'] +required_datasets['newsnr'] = required_datasets['snr'] + ['chisq', 'chisq_dof'] +required_datasets['new_snr'] = required_datasets['newsnr'] +required_datasets['newsnr_sgveto'] = required_datasets['newsnr'] + ['sg_chisq'] +required_datasets['newsnr_sgveto_psdvar'] = \ + required_datasets['newsnr_sgveto'] + ['psd_var_val'] +required_datasets['newsnr_sgveto_psdvar_threshold'] = \ + required_datasets['newsnr_sgveto_psdvar'] +required_datasets['newsnr_sgveto_psdvar_scaled'] = \ + required_datasets['newsnr_sgveto_psdvar'] +required_datasets['newsnr_sgveto_psdvar_scaled_threshold'] = \ + required_datasets['newsnr_sgveto_psdvar'] + def get_sngls_ranking_from_trigs(trigs, statname, **kwargs): """ @@ -309,7 +324,7 @@ def get_sngls_ranking_from_trigs(trigs, statname, **kwargs): Parameters ----------- - trigs: dict of numpy.ndarrays + trigs: dict of numpy.ndarrays, SingleDetTriggers or ReadByTemplate Dictionary holding single detector trigger information. statname: The statistic to use. diff --git a/pycbc/io/hdf.py b/pycbc/io/hdf.py index 754c63bf663..9bccc92536c 100644 --- a/pycbc/io/hdf.py +++ b/pycbc/io/hdf.py @@ -29,7 +29,8 @@ class HFile(h5py.File): """ Low level extensions to the capabilities of reading an hdf5 File """ - def select(self, fcn, *args, **kwds): + def select(self, fcn, *args, chunksize=10**6, derived=None, group='', + return_data=True, premask=None): """ Return arrays from an hdf5 file that satisfy the given function Parameters @@ -42,66 +43,137 @@ def select(self, fcn, *args, **kwds): A variable number of strings that are keys into the hdf5. These must refer to arrays of equal length. - chunksize : {1e6, int}, optional + chunksize : {10**6, int}, optional Number of elements to read and process at a time. - return_indices : bool, optional - If True, also return the indices of elements passing the function. + derived : dictionary + Dictionary keyed on argument name (must be given in args), values + are a tuple of: the function to be computed, and the required + datasets. The function must take in a dictionary keyed on those + dataset names. + + group : string, optional + The group within the h5py file containing the datasets, e.g. in + standard offline merged trigger files, this would be the IFO. 
This
+            can be included in the args manually, but is required in the case
+            of derived functions, e.g. newsnr.
+
+        return_data : bool, optional, default True
+            If True, return the data for elements passing the function.
+
+        premask : array of booleans or indices, optional
+            The pre-mask to apply to the triggers at read-in; an array of
+            indices is converted to a boolean mask internally.
 
         Returns
         -------
-        values : np.ndarrays
-            A variable number of arrays depending on the number of keys into
-            the hdf5 file that are given. If return_indices is True, the first
-            element is an array of indices of elements passing the function.
+        indices : np.ndarray
+            An array of indices of elements passing the function.
+
+        return_tuple : tuple of np.ndarrays or None
+            A variable number of arrays depending on the number of args
+            provided. If return_data is True, the arrays are the values of
+            each arg; if return_data is False, this is None.
 
-        >>> f = HFile(filename)
-        >>> snr = f.select(lambda snr: snr > 6, 'H1/snr')
+        >>> f = HFile(filename)
+        >>> idx, (snr,) = f.select(lambda snr: snr > 6, 'snr', group='H1')
         """
-        # get references to each array
+        # Required datasets are the arguments requested and datasets given
+        # for any derived functions
+        derived = derived if derived is not None else {}
+        dsets = [a for a in list(args) if a not in derived]
+        for _, rqd_list in derived.values():
+            dsets += rqd_list
+
+        # remove any duplicates from dsets
+        dsets = list(set(dsets))
+
+        # Get the pointers to the h5py Datasets,
+        # check they can all be used together
         refs = {}
+        size = None
+        for ds in dsets:
+            refs[ds] = self[group + '/' + ds]
+            if (size is not None) and (refs[ds].size != size):
+                raise RuntimeError(f"Dataset {ds} is {refs[ds].size} "
+                                   "entries long, which does not match "
+                                   f"previous input datasets ({size}).")
+            size = refs[ds].size
+
+        # Apply any pre-masks
+        if premask is None:
+            mask = np.ones(size, dtype=bool)
+        else:
+            mask = premask
+
+        if not mask.dtype == bool:
+            # mask is an array of indices rather than booleans,
+            # make it a bool array
+            new_mask = np.zeros(size, dtype=bool)
+            new_mask[mask] = True
+            mask = new_mask
+
+        if not mask.size == size:
+            raise RuntimeError(f"Using premask of size {mask.size} which "
+                               f"does not match the input datasets ({size}).")
+
+        # datasets being returned (possibly)
         data = {}
+        indices = np.array([], dtype=np.uint64)
         for arg in args:
-            refs[arg] = self[arg]
             data[arg] = []
 
-        return_indices = kwds.get('return_indices', False)
-        indices = np.array([], dtype=np.uint64)
-
-        # To conserve memory read the array in chunks
-        chunksize = kwds.get('chunksize', int(1e6))
-        size = len(refs[arg])
-
+        # Loop through the chunks:
         i = 0
         while i < size:
             r = i + chunksize if i + chunksize < size else size
 
-            # Read each chunk's worth of data and find where it passes the function
-            partial = [refs[arg][i:r] for arg in args]
+            if not any(mask[i:r]):
+                # Nothing allowed through the mask in this chunk
+                i += chunksize
+                continue
+
+            if all(mask[i:r]):
+                # Everything allowed through the mask in this chunk
+                submask = np.arange(r - i)
+            else:
+                submask = np.flatnonzero(mask[i:r])
+
+            # Read each chunk's worth of data
+            partial_data = {arg: refs[arg][i:r][mask[i:r]]
+                            for arg in dsets}
+            partial = []
+            for a in args:
+                if a in derived.keys():
+                    # If this is a derived dataset, calculate it
+                    derived_fcn = derived[a][0]
+                    partial += [derived_fcn(partial_data)]
+                else:
+                    # otherwise, just read from the file
+                    partial += [partial_data[a]]
+
+            # Find where it passes the function
             keep = fcn(*partial)
-            if return_indices:
-                indices = np.concatenate([indices, np.flatnonzero(keep) + i])
 
-            # Store only the results that pass the function
-            for arg, part in zip(args,
partial):
-                data[arg].append(part[keep])
+            # Keep the indices which pass the function:
+            indices = np.concatenate([indices, submask[keep] + i])
+
+            if return_data:
+                # Store the dataset results that pass the function
+                for arg, part in zip(args, partial):
+                    data[arg].append(part[keep])
 
             i += chunksize
 
-        # Combine the partial results into full arrays
-        if len(args) == 1:
-            res = np.concatenate(data[args[0]])
-            if return_indices:
-                return indices.astype(np.uint64), res
-            else:
-                return res
+        if return_data:
+            return_tuple = tuple(np.concatenate(data[arg])
+                                 for arg in args)
         else:
-            res = tuple(np.concatenate(data[arg]) for arg in args)
-            if return_indices:
-                return (indices.astype(np.uint64),) + res
-            else:
-                return res
+            return_tuple = None
+
+        return indices.astype(np.uint64), return_tuple
 
 
 class DictArray(object):
@@ -402,47 +474,92 @@ class SingleDetTriggers(object):
     """
     Provides easy access to the parameters of single-detector CBC triggers.
     """
-    # FIXME: Some of these are optional and should be kwargs.
-    def __init__(self, trig_file, bank_file, veto_file,
-                 segment_name, filter_func, detector, premask=None):
+    def __init__(self, trig_file, detector, bank_file=None, veto_file=None,
+                 segment_name=None, premask=None, filter_rank=None,
+                 filter_threshold=None, chunksize=10**6, filter_func=None):
+        """
+        Create a SingleDetTriggers instance
+
+        Parameters
+        ----------
+        trig_file : string or os.pathtype, required
+            HDF file containing trigger information
+
+        detector : string, required
+            The detector being used; this is used to access the
+            triggers in trig_file
+
+        bank_file : string or os.pathtype, optional
+            HDF file containing template bank information
+
+        veto_file : string or os.pathtype, optional
+            File used to define vetoes
+
+        segment_name : string, optional
+            Segment name being used in the veto_file
+
+        premask : array of booleans or indices, optional
+            Mask of triggers to read in: either a boolean array over all
+            triggers in the file, or an array of indices
+
+        filter_rank : string, optional
+            The ranking, as defined by ranking.py, to compare to
+            filter_threshold
+
+        filter_threshold : float, required if filter_rank is used
+            Threshold to filter the ranking values
+
+        chunksize : int, default 10**6
+            Size of chunks to read in for the filter_rank / threshold.
+        """
         logging.info('Loading triggers')
         self.trigs_f = HFile(trig_file, 'r')
         self.trigs = self.trigs_f[detector]
+        self.ntriggers = self.trigs['end_time'].size
         self.ifo = detector  # convenience attributes
         self.detector = detector
         if bank_file:
             logging.info('Loading bank')
             self.bank = HFile(bank_file, 'r')
         else:
-            logging.info('No bank file given to SingleDetTriggers')
             # empty dict in place of non-existent hdf file
             self.bank = {}
 
-        if premask is not None:
-            self.mask = premask
-        else:
-            self.mask = np.ones(len(self.trigs['end_time']), dtype=bool)
-
-        if veto_file:
-            logging.info('Applying veto segments')
-            # veto_mask is an array of indices into the trigger arrays
-            # giving the surviving triggers
-            logging.info('%i triggers before vetoes', self.mask.sum())
-            self.veto_mask, _ = events.veto.indices_outside_segments(
-                self.end_time, [veto_file],
-                ifo=detector, segment_name=segment_name)
+        # Apply some masks to start off with - here we should try and apply
+        # them in the order which cuts most things earliest.
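For context on the filter_rank handling just below: HFile.select() can compute rankings on the fly through its derived argument, pairing each requested name with (function, required datasets), as registered in ranking.required_datasets above. A sketch of that mechanism (hypothetical file name):

    from pycbc.io.hdf import HFile
    from pycbc.events import ranking

    fcn_dsets = (ranking.sngls_ranking_function_dict['newsnr'],
                 ranking.required_datasets['newsnr'])
    # 'H1-TRIGGERS.hdf' is a hypothetical merged single-detector trigger file
    with HFile('H1-TRIGGERS.hdf', 'r') as trig_file:
        idx, _ = trig_file.select(
            lambda nsnr: nsnr > 6,
            'newsnr',
            derived={'newsnr': fcn_dsets},
            group='H1',
            return_data=False,
        )
    # 'newsnr' is computed chunk-by-chunk from snr/chisq/chisq_dof rather
    # than read from the file; only the passing indices are returned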
-
-            idx = np.flatnonzero(self.mask)[self.veto_mask]
-            self.mask[:] = False
-            self.mask[idx] = True
-            logging.info('%i triggers remain after vetoes',
-                         len(self.veto_mask))
+        # Apply any pre-masks
+        if premask is None:
+            self.mask = np.ones(self.ntriggers, dtype=bool)
+        else:
+            self.mask = None
+            self.apply_mask(premask)
+
+        if filter_rank:
+            assert filter_threshold is not None
+            logging.info("Applying threshold of %.3f on %s",
+                         filter_threshold, filter_rank)
+            fcn_dsets = (ranking.sngls_ranking_function_dict[filter_rank],
+                         ranking.required_datasets[filter_rank])
+            idx, _ = self.trigs_f.select(
+                lambda rank: rank > filter_threshold,
+                filter_rank,
+                derived={filter_rank: fcn_dsets},
+                return_data=False,
+                premask=self.mask,
+                group=detector,
+                chunksize=chunksize,
+            )
+            logging.info("%d triggers remain", idx.size)
+            # If self.mask already has values, need to take these into account:
+            self.and_masks(idx)
 
         if filter_func:
-            # get required columns into the namespace with dummy attribute
-            # names to avoid confusion with other class properties
+            # Apply a filter on the triggers which is _not_ a ranking statistic
+            for rank_str in ranking.sngls_ranking_function_dict.keys():
+                if f'self.{rank_str}' in filter_func:
+                    logging.warning('Supplying the ranking (%s) in '
+                                    'filter_func is inefficient; consider '
+                                    'using filter_rank instead.', rank_str)
             logging.info('Setting up filter function')
             for c in self.trigs.keys():
                 if c in filter_func:
@@ -451,17 +568,31 @@ def __init__(self, trig_file, bank_file, veto_file,
                 if c in filter_func:
                     # get template parameters corresponding to triggers
                     setattr(self, '_'+c,
-                        np.array(self.bank[c])[self.trigs['template_id'][:]])
+                            np.array(self.bank[c])[self.trigs['template_id'][:]])
 
-            self.filter_mask = eval(filter_func.replace('self.', 'self._'))
+            filter_mask = eval(filter_func.replace('self.', 'self._'))
             # remove the dummy attributes
             for c in chain(self.trigs.keys(), self.bank.keys()):
                 if c in filter_func:
                     delattr(self, '_'+c)
 
-            self.mask = self.mask & self.filter_mask
+            self.apply_mask(filter_mask)
             logging.info('%i triggers remain after cut on %s',
                          sum(self.mask), filter_func)
 
+        if veto_file:
+            logging.info('Applying veto segments')
+            # veto_mask is an array of indices into the trigger arrays
+            # giving the surviving triggers
+            logging.info('%i triggers before vetoes', self.mask_size)
+            veto_mask, _ = events.veto.indices_outside_segments(
+                self.end_time, [veto_file],
+                ifo=detector, segment_name=segment_name)
+
+            # Update mask accordingly
+            self.apply_mask(veto_mask)
+            logging.info('%i triggers remain after vetoes',
+                         self.mask_size)
+
     def __getitem__(self, key):
         # Is key in the TRIGGER_MERGE file?
try: @@ -483,7 +614,7 @@ def checkbank(self, param): % param) def trig_dict(self): - """Returns dict of the masked trigger valuse """ + """Returns dict of the masked trigger values""" mtrigs = {} for k in self.trigs: if len(self.trigs[k]) == len(self.trigs['end_time']): @@ -500,54 +631,115 @@ def get_param_names(cls): if type(m[1]) == property] def apply_mask(self, logic_mask): - """Apply a boolean array to the set of triggers""" - if hasattr(self.mask, 'dtype') and (self.mask.dtype == 'bool'): - orig_indices = self.mask.nonzero()[0][logic_mask] + """Apply a mask over the top of the current mask + + Parameters + ---------- + logic_mask : boolean array or numpy array of indices + """ + if self.mask is None: + self.mask = np.zeros(self.ntriggers, dtype=bool) + self.mask[logic_mask] = True + elif hasattr(self.mask, 'dtype') and (self.mask.dtype == 'bool'): + orig_indices = np.flatnonzero(self.mask)[logic_mask] self.mask[:] = False self.mask[orig_indices] = True else: self.mask = list(np.array(self.mask)[logic_mask]) + def and_masks(self, logic_mask): + """Apply a mask to be combined as a logical and with the current mask. + + Parameters + ---------- + logic_mask : boolean array or numpy array/list of indices + """ + if self.mask_size == self.ntriggers: + # No mask exists, just update to use the given mask + self.apply_mask(logic_mask) + return + + # Use intersection of the indices of True values in the masks + if hasattr(logic_mask, 'dtype') and (logic_mask.dtype == 'bool'): + new_indices = np.flatnonzero(logic_mask) + else: + new_indices = np.array(logic_mask) + + if hasattr(self.mask, 'dtype') and (self.mask.dtype == 'bool'): + orig_indices = np.flatnonzero(self.mask) + else: + orig_indices = np.array(self.mask) + + self.mask[:] = False + and_indices = np.intersect1d(new_indices, orig_indices) + self.mask[and_indices.astype(np.uint64)] = True + def mask_to_n_loudest_clustered_events(self, rank_method, + ranking_threshold=6, n_loudest=10, cluster_window=10): """Edits the mask property of the class to point to the N loudest - single detector events as ranked by ranking statistic. Events are - clustered so that no more than 1 event within +/- cluster-window will - be considered.""" + single detector events as ranked by ranking statistic. + + Events are clustered so that no more than 1 event within +/- + cluster_window will be considered. Can apply a threshold on the + ranking using ranking_threshold + """ - # If this becomes memory intensive we can optimize sds = rank_method.single(self.trig_dict()) stat = rank_method.rank_stat_single((self.ifo, sds)) if len(stat) == 0: - # No triggers, so just return here - self.stat = np.array([]) + # No triggers at all, so just return here + self.apply_mask(np.array([], dtype=np.uint64)) return times = self.end_time + if ranking_threshold: + # Threshold on sngl_ranking + # Note that we can provide None or zero to do no thresholding + # but the default is to do some + keep = stat >= ranking_threshold + stat = stat[keep] + times = times[keep] + self.apply_mask(keep) + + if len(stat) == 0: + logging.warning("No triggers after thresholding") + return + else: + logging.info("%d triggers after thresholding", len(stat)) + index = stat.argsort()[::-1] new_times = [] new_index = [] + # Loop through triggers - loudest first for curr_idx in index: curr_time = times[curr_idx] for time in new_times: + # Have we already got a louder trigger within the window? 
                if abs(curr_time - time) < cluster_window:
                    break
            else:
-                # Only get here if no other triggers within cluster window
+                # Store if no other triggers within cluster window
                 new_index.append(curr_idx)
                 new_times.append(curr_time)
             if len(new_index) >= n_loudest:
+                # We have as many triggers as we want now
                 break
 
+        # For indexing, indices need to be a numpy array, in sorted order
         index = np.array(new_index)
         index.sort()
-        self.stat = stat[index]
-        if hasattr(self.mask, 'dtype') and self.mask.dtype == 'bool':
-            orig_indices = np.flatnonzero(self.mask)[index]
-            self.mask = list(orig_indices)
-        elif isinstance(self.mask, list):
-            self.mask = list(np.array(self.mask)[index])
+        # Apply to the existing mask
+        self.apply_mask(index)
+
+    @property
+    def mask_size(self):
+        if self.mask is None:
+            return self.ntriggers
+        if isinstance(self.mask, list):
+            return len(self.mask)
+        return np.count_nonzero(self.mask)
 
     @property
     def template_id(self):
@@ -690,14 +882,17 @@ def get_ranking(self, rank_name, **kwargs):
         return ranking.get_sngls_ranking_from_trigs(self, rank_name, **kwargs)
 
     def get_column(self, cname):
+        """
+        Read columns while applying the mask
+        """
         # Fiducial value that seems to work, not extensively tuned.
         MFRAC = 0.3
         # If the mask accesses few enough elements then directly use it
         # This can be slower than reading in all the elements if most of them
         # will be read.
-        if self.mask is not None and (isinstance(self.mask, list) or \
-            (len(self.mask.nonzero()[0]) < (len(self.mask) * MFRAC))):
+        if isinstance(self.mask, list) or \
+                self.mask_size < (self.ntriggers * MFRAC):
             return self.trigs[cname][self.mask]
 
         # We have a lot of elements to read so we resort to reading the entire
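To close, the two mask-combination methods added in this class differ in their index conventions: apply_mask interprets indices relative to the current selection, while and_masks intersects indices given in whole-file trigger coordinates. A toy sketch (hypothetical file name, illustrative values):

    import numpy as np
    from pycbc.io.hdf import SingleDetTriggers

    # 'H1-TRIGGERS.hdf' is a hypothetical merged single-detector trigger file
    trigs = SingleDetTriggers('H1-TRIGGERS.hdf', 'H1',
                              premask=np.array([10, 11, 12, 13]))

    # apply_mask indexes within the current selection: keep the 1st and 3rd
    # surviving triggers, i.e. original triggers 10 and 12
    trigs.apply_mask([0, 2])

    # and_masks takes original trigger indices and intersects them with the
    # current mask: only trigger 12 survives
    trigs.and_masks(np.array([3, 12]))
    print(trigs.mask_size)  # -> 1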