diff --git a/pycbc/events/threshold_cupy.py b/pycbc/events/threshold_cupy.py new file mode 100644 index 00000000000..c20fe18aa13 --- /dev/null +++ b/pycbc/events/threshold_cupy.py @@ -0,0 +1,259 @@ +# Copyright (C) 2012 Alex Nitz +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# + +import cupy as cp +import functools +import mako.template +from .eventmgr import _BaseThresholdCluster +import pycbc.scheme + +val = None +loc = None + +# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy + +tkernel1 = mako.template.Template(""" +extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){ + int s = window * blockIdx.x; + int e = s + window; + + // shared memory for chuck size candidates + __shared__ float svr[${chunk}]; + __shared__ float svi[${chunk}]; + __shared__ int sl[${chunk}]; + + // shared memory for the warp size candidates + __shared__ float svv[32]; + __shared__ int idx[32]; + + int ml = -1; + float mvr = 0; + float mvi = 0; + float re; + float im; + + // Iterate trought the entire window size chunk and find blockDim.x number + // of candidates + for (int i = s + threadIdx.x; i < e; i += blockDim.x){ + re = in[i].x; + im = in[i].y; + if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){ + mvr = re; + mvi = im; + ml = i; + } + } + + // Save the candidate from this thread to shared memory + svr[threadIdx.x] = mvr; + svi[threadIdx.x] = mvi; + sl[threadIdx.x] = ml; + + __syncthreads(); + + if (threadIdx.x < 32){ + int tl = threadIdx.x; + + // Now that we have all the candiates for this chunk in shared memory + // Iterate through in the warp size to reduce to 32 candidates + for (int i = threadIdx.x; i < ${chunk}; i += 32){ + re = svr[i]; + im = svi[i]; + if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){ + tl = i; + mvr = re; + mvi = im; + } + } + + // Store the 32 candidates into shared memory + svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl]; + idx[threadIdx.x] = tl; + + // Find the 1 candidate we are looking for using a manual log algorithm + if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){ + svv[threadIdx.x] = svv[threadIdx.x + 16]; + idx[threadIdx.x] = idx[threadIdx.x + 16]; + } + + if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){ + svv[threadIdx.x] = svv[threadIdx.x + 8]; + idx[threadIdx.x] = idx[threadIdx.x + 8]; + } + + if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){ + svv[threadIdx.x] = svv[threadIdx.x + 4]; + idx[threadIdx.x] = idx[threadIdx.x + 4]; + } + + if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){ + svv[threadIdx.x] = svv[threadIdx.x + 2]; + idx[threadIdx.x] = idx[threadIdx.x + 2]; + } + + + // Save the 1 candidate maximum and location to the output vectors + if (threadIdx.x == 0){ + if (svv[threadIdx.x] < svv[threadIdx.x + 1]){ + idx[0] = idx[1]; + svv[0] = svv[1]; + } + + if (svv[0] > threshold){ + tl = idx[0]; + outv[blockIdx.x].x = svr[tl]; + outv[blockIdx.x].y = svi[tl]; + outl[blockIdx.x] = sl[tl]; + } else{ + outl[blockIdx.x] = -1; + } + } + } +} +""") + +tkernel2 = mako.template.Template(""" +extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){ + __shared__ int loc[${blocks}]; + __shared__ float val[${blocks}]; + + int i = threadIdx.x; + + int l = outl[i]; + loc[i] = l; + + if (l == -1) + return; + + val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y; + + + // Check right + if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){ + outl[i] = -1; + return; + } + + // Check left + if ( (i > 0) && (val[i - 1] > val[i]) ){ + outl[i] = -1; + return; + } +} +""") + +@functools.lru_cache(maxsize=None) +def get_tkernel(slen, window): + if window < 32: + raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples") + + elif window <= 4096: + nt = 128 + elif window <= 16384: + nt = 256 + elif window <= 32768: + nt = 512 + else: + nt = 1024 + + nb = int(cp.ceil(slen / float(window))) + + if nb > 1024: + raise ValueError("More than 1024 blocks not supported yet") + + fn = cp.RawKernel( + tkernel1.render(chunk=nt), + 'threshold_and_cluster', + backend='nvcc' + ) + fn2 = cp.RawKernel( + tkernel2.render(blocks=nb), + 'threshold_and_cluster2', + backend='nvcc' + ) + return (fn, fn2), nt, nb + +def threshold_and_cluster(series, threshold, window): + global val + global loc + if val is None: + val = cp.zeros(4096*256, dtype=cp.complex64) + if loc is None: + loc = cp.zeros(4096*256, cp.int32) + + outl = loc + outv = val + slen = len(series) + series = series.data + (fn, fn2), nt, nb = get_tkernel(slen, window) + threshold = cp.float32(threshold * threshold) + window = cp.int32(window) + + cl = loc[0:nb] + cv = val[0:nb] + + fn((nb,), (nt,), (series.data, outv, outl, window, threshold)) + fn2((1,), (nb,), (outv, outl, threshold, window)) + w = (cl != -1) + return cv[w], cl[w] + +class CUDAThresholdCluster(_BaseThresholdCluster): + def __init__(self, series): + self.series = series + + global val + global loc + if val is None: + val = cp.zeros(4096*256, dtype=cp.complex64) + if loc is None: + loc = cp.zeros(4096*256, cp.int32) + + self.outl = loc + self.outv = val + self.slen = len(series) + + def threshold_and_cluster(self, threshold, window): + threshold = cp.float32(threshold * threshold) + window = cp.int32(window) + + (fn, fn2), nt, nb = get_tkernel(self.slen, window) + cl = loc[0:nb] + cv = val[0:nb] + + fn( + (nt, 1, 1), + (nb, 1), + (self.series.data, self.outv, self.outl, window, threshold) + ) + fn2( + (nb, 1, 1), + (1, 1), + (self.outv, self.outl, threshold, window) + ) + w = (cl != -1) + return cp.asnumpy(cv[w]), cp.asnumpy(cl[w]) + +def _threshold_cluster_factory(series): + return CUDAThresholdCluster + diff --git a/pycbc/fft/backend_cupy.py b/pycbc/fft/backend_cupy.py new file mode 100644 index 00000000000..fec13ad746b --- /dev/null +++ b/pycbc/fft/backend_cupy.py @@ -0,0 +1,37 @@ +# Copyright (C) 2024 Y Ddraig Goch +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with with program; see the file COPYING. If not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA + +from .core import _list_available + +_backend_dict = {'cupy' : 'cupyfft'} +_backend_list = ['cupy'] + +_alist, _adict = _list_available(_backend_list, _backend_dict) + +cupy_backend = None + +def set_backend(backend_list): + global cupy_backend + for backend in backend_list: + if backend in _alist: + cupy_backend = backend + break + +def get_backend(): + return _adict[cupy_backend] + +set_backend(_backend_list) diff --git a/pycbc/fft/backend_support.py b/pycbc/fft/backend_support.py index cae8c69eb19..ed1c76c7fe8 100644 --- a/pycbc/fft/backend_support.py +++ b/pycbc/fft/backend_support.py @@ -72,7 +72,7 @@ def get_backend(): # Import all scheme-dependent backends, to get _all_backends accurate: -for scheme_name in ["cpu", "mkl", "cuda"]: +for scheme_name in ["cpu", "mkl", "cuda", "cupy"]: try: mod = __import__('pycbc.fft.backend_' + scheme_name, fromlist = ['_alist', '_adict']) _alist = getattr(mod, "_alist") diff --git a/pycbc/fft/cupyfft.py b/pycbc/fft/cupyfft.py new file mode 100644 index 00000000000..c45d7173608 --- /dev/null +++ b/pycbc/fft/cupyfft.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Y Ddraig Goch +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +""" +This module provides the cupy backend of the fast Fourier transform +for the PyCBC package. +""" + +import logging +import cupy.fft +from .core import _check_fft_args +from .core import _BaseFFT, _BaseIFFT + +_INV_FFT_MSG = ("I cannot perform an {} between data with an input type of " + "{} and an output type of {}") + +def fft(invec, outvec, _, itype, otype): + if invec.ptr == outvec.ptr: + raise NotImplementedError("cupy backend of pycbc.fft does not " + "support in-place transforms") + if itype == 'complex' and otype == 'complex': + outvec.data[:] = cupy.asarray(cupy.fft.fft(invec.data), + dtype=outvec.dtype) + elif itype == 'real' and otype == 'complex': + outvec.data[:] = cupy.asarray(cupy.fft.rfft(invec.data), + dtype=outvec.dtype) + else: + raise ValueError(_INV_FFT_MSG.format("FFT", itype, otype)) + + +def ifft(invec, outvec, _, itype, otype): + if invec.ptr == outvec.ptr: + raise NotImplementedError("cupy backend of pycbc.fft does not " + "support in-place transforms") + if itype == 'complex' and otype == 'complex': + outvec.data[:] = cupy.asarray(cupy.fft.ifft(invec.data), + dtype=outvec.dtype) + outvec *= len(outvec) + elif itype == 'complex' and otype == 'real': + outvec.data[:] = cupy.asarray(cupy.fft.irfft(invec.data,len(outvec)), + dtype=outvec.dtype) + outvec *= len(outvec) + else: + raise ValueError(_INV_FFT_MSG.format("IFFT", itype, otype)) + + +class FFT(_BaseFFT): + """ + Class for performing FFTs via the cupy interface. + """ + def __init__(self, invec, outvec, nbatch=1, size=None): + super(FFT, self).__init__(invec, outvec, nbatch, size) + self.prec, self.itype, self.otype = _check_fft_args(invec, outvec) + + def execute(self): + fft(self.invec, self.outvec, self.prec, self.itype, self.otype) + + +class IFFT(_BaseIFFT): + """ + Class for performing IFFTs via the cupy interface. + """ + def __init__(self, invec, outvec, nbatch=1, size=None): + super(IFFT, self).__init__(invec, outvec, nbatch, size) + self.prec, self.itype, self.otype = _check_fft_args(invec, outvec) + + def execute(self): + ifft(self.invec, self.outvec, self.prec, self.itype, self.otype) diff --git a/pycbc/filter/matchedfilter_cupy.py b/pycbc/filter/matchedfilter_cupy.py new file mode 100644 index 00000000000..1a5a01a65fa --- /dev/null +++ b/pycbc/filter/matchedfilter_cupy.py @@ -0,0 +1,54 @@ +# Copyright (C) 2024 Y Ddraig Goch +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# + +import cupy as cp +from .matchedfilter import _BaseCorrelator + +# Here X,Y,Z are "type placeholder"s, so this covers 32 and 64 bit inputs. +# It would work for real -> real as well, except maybe conj would fail. +# I've also made this work for mixed types (32 bit -> 64 bit), but this means +# we need to always supply the output, which we do. +correlate_kernel = cp.ElementwiseKernel( + "X x, Y y", + "Z z", + "z = conj(x) * y", + "correlate_kernel" +) + +def correlate(a, b, out): + correlate_kernel(a.data, b.data, out.data) + +class CUPYCorrelator(_BaseCorrelator): + def __init__(self, x, y, z): + self.x = x.data + self.y = y.data + self.z = z.data + + def correlate(self): + correlate_kernel(self.x, self.y, self.z) + +def _correlate_factory(x, y, z): + return CUPYCorrelator + + diff --git a/pycbc/psd/estimate.py b/pycbc/psd/estimate.py index d69b5cadbb0..4a245a6584a 100644 --- a/pycbc/psd/estimate.py +++ b/pycbc/psd/estimate.py @@ -195,6 +195,7 @@ def welch(timeseries, seg_len=4096, seg_stride=2048, window='hann', median_bias(len(even_psds)) psd = (odd_median + even_median) / 2 + w = w.numpy() psd *= 2 * delta_f * seg_len / (w*w).sum() return FrequencySeries(psd, delta_f=delta_f, dtype=timeseries.dtype, diff --git a/pycbc/scheme.py b/pycbc/scheme.py index e6eb6f87f86..0a9e6740e1e 100644 --- a/pycbc/scheme.py +++ b/pycbc/scheme.py @@ -115,6 +115,30 @@ def __init__(self, device_num=0): import atexit atexit.register(clean_cuda,self.context) + +class CUPYScheme(Scheme): + """Scheme for using CUPY""" + def __init__(self, device_num=None): + import cupy # Fail now if cupy is not there. + import cupy.cuda + self.device_num = device_num + self.cuda_device = cupy.cuda.Device(self.device_num) + def __enter__(self): + super().__enter__() + self.cuda_device.__enter__() + logging.warn( + "You are using the CUPY GPU backend for PyCBC. This backend is " + "still only a prototype. It may be useful for your application " + "but it may fail unexpectedly, run slowly, or not give correct " + "output. Please do contribute to the effort to develop this " + "further." + ) + + def __exit__(self, *args): + super().__exit__(*args) + self.cuda_device.__exit__(*args) + + class CPUScheme(Scheme): def __init__(self, num_threads=1): if isinstance(num_threads, int): @@ -160,6 +184,7 @@ class NumpyScheme(CPUScheme): scheme_prefix = { CUDAScheme: "cuda", CPUScheme: "cpu", + CUPYScheme: "cupy", MKLScheme: "mkl", NumpyScheme: "numpy", } @@ -212,10 +237,10 @@ def _scheming_function(*args, **kwds): return schemed_fn(*args, **kwds) - err = """Failed to find implementation of (%s) - for %s scheme." % (str(fn), current_prefix())""" + err = (f"Failed to find implementation of {func.__name__} " + f"for {current_prefix()} scheme. ") for emsg in exc_errors: - err += print(emsg) + err += str(emsg) + " " raise RuntimeError(err) return _scheming_function @@ -292,6 +317,9 @@ def from_cli(opt): else: ctx = MKLScheme() logger.info("Running with MKL support: %s threads" % ctx.num_threads) + elif name == 'cupy': + logger.info("Running with CUPY support") + ctx = CUPYScheme() else: if len(scheme_str) > 1: numt = scheme_str[1] diff --git a/pycbc/types/array_cupy.py b/pycbc/types/array_cupy.py new file mode 100644 index 00000000000..a70a4f530e8 --- /dev/null +++ b/pycbc/types/array_cupy.py @@ -0,0 +1,140 @@ +# Copyright (C) 2024 Y Ddraig Goch +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Cupy based CPU backend for PyCBC Array +""" +import cupy as cp +from pycbc.types.array import common_kind, complex128, float64 +from scipy.linalg import blas +from pycbc.types import real_same_precision_as + +def zeros(length, dtype=cp.float64): + return cp.zeros(length, dtype=dtype) + +def empty(length, dtype=cp.float64): + return cp.empty(length, dtype=dtype) + +def ptr(self): + return self.data.data.mem.ptr + +def dot(self, other): + return cp.dot(self._data,other) + +def min(self): + return self.data.min() + +def abs_max_loc(self): + if self.kind == 'real': + tmp = abs(self.data) + ind = cp.argmax(tmp) + return tmp[ind], ind + else: + tmp = self.data.real ** 2.0 + tmp += self.data.imag ** 2.0 + ind = cp.argmax(tmp) + return tmp[ind] ** 0.5, ind + +def cumsum(self): + return self.data.cumsum() + +def max(self): + return self.data.max() + +def max_loc(self): + ind = cp.argmax(self.data) + return self.data[ind], ind + +def take(self, indices): + return self.data.take(indices) + +def weighted_inner(self, other, weight): + """ Return the inner product of the array with complex conjugation. + """ + if weight is None: + return self.inner(other) + + cdtype = common_kind(self.dtype, other.dtype) + if cdtype.kind == 'c': + acum_dtype = complex128 + else: + acum_dtype = float64 + + return cp.sum(self.data.conj() * other / weight, dtype=acum_dtype) + +def abs_arg_max(self): + if self.dtype == cp.float32 or self.dtype == cp.float64: + return cp.argmax(abs(self.data)) + else: + return abs_arg_max_complex(self._data) + +def inner(self, other): + """ Return the inner product of the array with complex conjugation. + """ + cdtype = common_kind(self.dtype, other.dtype) + if cdtype.kind == 'c': + return cp.sum(self.data.conj() * other, dtype=complex128) + else: + return inner_real(self.data, other) + +def vdot(self, other): + """ Return the inner product of the array with complex conjugation. + """ + return cp.vdot(self.data, other) + +def squared_norm(self): + """ Return the elementwise squared norm of the array """ + return (self.data.real**2 + self.data.imag**2) + +def numpy(self): + return cp.asnumpy(self.data) + +def _copy(self, self_ref, other_ref): + self_ref[:] = other_ref[:] + +def _getvalue(self, index): + return self._data[index] + +def sum(self): + if self.kind == 'real': + return cp.sum(self._data,dtype=float64) + else: + return cp.sum(self._data,dtype=complex128) + +def clear(self): + self[:] = 0 + +def _scheme_matches_base_array(array): + if isinstance(array, cp.ndarray): + return True + else: + return False + +def _to_device(array): + return cp.asarray(array) + +def numpy(self): + return cp.asnumpy(self._data) + +def _copy_base_array(array): + return array.copy() + diff --git a/pycbc/types/frequencyseries.py b/pycbc/types/frequencyseries.py index 2b0a0115fe9..d0d51e64af9 100644 --- a/pycbc/types/frequencyseries.py +++ b/pycbc/types/frequencyseries.py @@ -472,12 +472,13 @@ def to_timeseries(self, delta_t=None): tmp = self else: tmp = FrequencySeries(zeros(flen, dtype=self.dtype), - delta_f=self.delta_f, epoch=self.epoch) + delta_f=self.delta_f, epoch=self.epoch, + copy=False) tmp[:len(self)] = self[:] f = TimeSeries(zeros(tlen, dtype=real_same_precision_as(self)), - delta_t=delta_t) + delta_t=delta_t, copy=False) ifft(tmp, f) f._delta_t = delta_t return f diff --git a/pycbc/vetoes/chisq_cupy.py b/pycbc/vetoes/chisq_cupy.py new file mode 100644 index 00000000000..51b0b7e7912 --- /dev/null +++ b/pycbc/vetoes/chisq_cupy.py @@ -0,0 +1,345 @@ +# Copyright (C) 2015 Alex Nitz, Josh Willis +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# + +import functools +import numpy +import cupy as cp +import lal +from mako.template import Template + +LALARGS = { + 'TWOPI': lal.TWOPI, +} + +accum_diff_sq_kernel = cp.ElementwiseKernel( + "X input", + "raw Y output", + "output[i] += norm(input)", + "accum_diff_sq_kernel" +) + +def chisq_accum_bin(chisq, q): + accum_diff_sq_kernel(q.data, chisq.data) + + +chisqkernel = Template(""" +#include +extern "C" __global__ void power_chisq_at_points_${NP}( + %if fuse: + float2* htilde, + float2* stilde, + %else: + float2* corr, + %endif + float2* outc, unsigned int N, + %for p in range(NP): + float phase${p}, + %endfor + uint32_t* kmin, + uint32_t* kmax, + uint32_t* bv, + unsigned int nbins){ + __shared__ unsigned int s; + __shared__ unsigned int e; + __shared__ float2 chisq[${NT} * ${NP}]; + + // load integration boundaries (might not be bin boundaries if bin is large) + if (threadIdx.x == 0){ + s = kmin[blockIdx.x]; + e = kmax[blockIdx.x]; + } + + % for p in range(NP): + chisq[threadIdx.x + ${NT*p}].x = 0; + chisq[threadIdx.x + ${NT*p}].y = 0; + % endfor + __syncthreads(); + + // calculate the chisq integral for each thread + // sliding reduction for each thread from s, e + for (int i = threadIdx.x + s; i < e; i += blockDim.x){ + float re, im; + + %if fuse: + float2 qt, st, ht; + st = stilde[i]; + ht = htilde[i]; + qt.x = ht.x * st.x + ht.y * st.y; + qt.y = ht.x * st.y - ht.y * st.x; + %else: + float2 qt = corr[i]; + %endif + + %for p in range(NP): + sincosf(phase${p} * i, &im, &re); + chisq[threadIdx.x + ${NT*p}].x += re * qt.x - im * qt.y; + chisq[threadIdx.x + ${NT*p}].y += im * qt.x + re * qt.y; + %endfor + } + + float x, y, x2, y2; + // logarithmic reduction within thread block + for (int j=${NT} / 2; j>=1; j/=2){ + if (threadIdx.x +extern "C" __global__ void power_chisq_at_points_${NP}_pow2( + %if fuse: + float2* htilde, + float2* stilde, + %else: + float2* corr, + %endif + float2* outc, unsigned int N, + %for p in range(NP): + unsigned int points${p}, + %endfor + uint32_t* kmin, + uint32_t* kmax, + uint32_t* bv, + unsigned int nbins){ + __shared__ unsigned int s; + __shared__ unsigned int e; + __shared__ float2 chisq[${NT} * ${NP}]; + float twopi = ${TWOPI}; + unsigned long long NN; + + NN = (unsigned long long) N; + + // load integration boundaries (might not be bin boundaries if bin is large) + if (threadIdx.x == 0){ + s = kmin[blockIdx.x]; + e = kmax[blockIdx.x]; + } + + % for p in range(NP): + chisq[threadIdx.x + ${NT*p}].x = 0; + chisq[threadIdx.x + ${NT*p}].y = 0; + % endfor + __syncthreads(); + + // calculate the chisq integral for each thread + // sliding reduction for each thread from s, e + for (int i = threadIdx.x + s; i < e; i += blockDim.x){ + float re, im; + + %if fuse: + float2 qt, st, ht; + st = stilde[i]; + ht = htilde[i]; + qt.x = ht.x * st.x + ht.y * st.y; + qt.y = ht.x * st.y - ht.y * st.x; + %else: + float2 qt = corr[i]; + %endif + + %for p in range(NP): + unsigned long long prod${p} = points${p} * i; + unsigned int k${p} = (unsigned int) (prod${p}&(NN-1)); + float phase${p} = twopi * k${p}/((float) N); + __sincosf(phase${p}, &im, &re); + chisq[threadIdx.x + ${NT*p}].x += re * qt.x - im * qt.y; + chisq[threadIdx.x + ${NT*p}].y += im * qt.x + re * qt.y; + %endfor + } + + float x, y, x2, y2; + // logarithmic reduction within thread block + for (int j=${NT} / 2; j>=1; j/=2){ + if (threadIdx.x 0: + cargs = (corr, outp, lpoints, np, nb, N, kmin, kmax, bv, nbins) + + if np >= 4: + outp, lpoints, np = shift_sum_points_pow2(4, cargs) + elif np >= 3: + outp, lpoints, np = shift_sum_points_pow2(3, cargs) + elif np >= 2: + outp, lpoints, np = shift_sum_points_pow2(2, cargs) + elif np == 1: + outp, lpoints, np = shift_sum_points_pow2(1, cargs) + else: + phase = [numpy.float32(p * 2.0 * numpy.pi / N) for p in points] + while np > 0: + cargs = (corr, outp, phase, np, nb, N, kmin, kmax, bv, nbins) + + if np >= 4: + outp, phase, np = shift_sum_points(4, cargs) # pylint:disable=no-value-for-parameter + elif np >= 3: + outp, phase, np = shift_sum_points(3, cargs) # pylint:disable=no-value-for-parameter + elif np >= 2: + outp, phase, np = shift_sum_points(2, cargs) # pylint:disable=no-value-for-parameter + elif np == 1: + outp, phase, np = shift_sum_points(1, cargs) # pylint:disable=no-value-for-parameter + + return cp.asnumpy((outc.conj() * outc).sum(axis=1).real) + diff --git a/pycbc/waveform/spa_tmplt_cupy.py b/pycbc/waveform/spa_tmplt_cupy.py new file mode 100644 index 00000000000..63d429eff47 --- /dev/null +++ b/pycbc/waveform/spa_tmplt_cupy.py @@ -0,0 +1,100 @@ +# +# Apapted from code in LALSimInpspiralTaylorF2.c +# +# Copyright (C) 2007 Jolien Creighton, B.S. Sathyaprakash, Thomas Cokelaer +# Copyright (C) 2012 Leo Singer, Alex Nitz +# Adapted from code found in: +# - LALSimInspiralTaylorF2.c +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with with program; see the file COPYING. If not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA + +import cupy as cp +import lal +import mako.template + +taylorf2_text = mako.template.Template(""" + const float f = (i + kmin ) * delta_f; + const float amp2 = amp * __powf(f, -7.0/6.0); + const float v = __powf(piM*f, 1.0/3.0); + const float v2 = v * v; + const float v3 = v2 * v; + const float v4 = v2 * v2; + const float v5 = v2 * v3; + const float v6 = v3 * v3; + const float v7 = v3 * v4; + float phasing = 0.; + + float LAL_TWOPI = ${TWOPI}; + float LAL_PI_4 = ${PI_4}; + float log4 = ${LN4}; + float logv = __logf(v); + + switch (phase_order) + { + case -1: + case 7: + phasing += pfa7 * v7; + case 6: + phasing += (pfa6 + pfl6 * (logv + log4) ) * v6; + case 5: + phasing += (pfa5 + pfl5 * (logv) ) * v5; + case 4: + phasing += pfa4 * v4; + case 3: + phasing += pfa3 * v3; + case 2: + phasing += pfa2 * v2; + case 0: + phasing += 1.; + break; + default: + break; + } + phasing *= pfaN / v5; + phasing -= LAL_PI_4; + phasing -= int(phasing / (LAL_TWOPI)) * LAL_TWOPI; + + float pcos; + float psin; + __sincosf(phasing, &psin, &pcos); + + htilde.real(pcos * amp2); + htilde.imag(-psin * amp2); +""").render(TWOPI=lal.TWOPI, PI_4=lal.PI_4, LN4=2*lal.LN2) + + +taylorf2_kernel = cp.ElementwiseKernel( + """ + int64 kmin, int64 phase_order, float32 delta_f, float32 piM, + float32 pfaN, float32 pfa2, float32 pfa3, float32 pfa4, float32 pfa5, + float32 pfl5, float32 pfa6, float32 pfl6, float32 pfa7, float32 amp + """, + "complex64 htilde", + taylorf2_text, + "taylorf2_kernel", +) + + +def spa_tmplt_engine(htilde, kmin, phase_order, + delta_f, piM, pfaN, + pfa2, pfa3, pfa4, pfa5, pfl5, + pfa6, pfl6, pfa7, amp_factor): + """ Calculate the spa tmplt phase + """ + taylorf2_kernel(kmin, phase_order, + delta_f, piM, pfaN, + pfa2, pfa3, pfa4, pfa5, pfl5, + pfa6, pfl6, pfa7, amp_factor, htilde.data) diff --git a/pycbc/waveform/utils_cupy.py b/pycbc/waveform/utils_cupy.py new file mode 100644 index 00000000000..22b1fdc5a76 --- /dev/null +++ b/pycbc/waveform/utils_cupy.py @@ -0,0 +1,54 @@ +# Copyright (C) 2018 Collin Capano, Josh Willis +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""This module contains the CuPy-specific code for + convenience utilities for manipulating waveforms +""" +from pycbc.types import FrequencySeries +import cupy as xp + + + +def apply_fseries_time_shift(htilde, dt, kmin=0, copy=True): + """Shifts a frequency domain waveform in time. The waveform is assumed to + be sampled at equal frequency intervals. + """ + out = xp.array(htilde.data, copy=copy) + phi = -2j * xp.pi * dt * htilde.delta_f + kmax = len(htilde) + fstimeshift(out, phi, kmin, kmax) + if copy: + htilde = FrequencySeries(out, delta_f=htilde.delta_f, + epoch=htilde.epoch, copy=False) + return htilde + + +def fstimeshift(freqseries, phi, kmin, kmax): + # Please note that the CPU version has a number of optimizations with respect + # to this. + # FIXME: Convert to ElementwiseKernel and use kmin and max in that. + idx = xp.arange(len(freqseries)) + phase_shift = xp.exp(phi * idx) + freqseries[:] = freqseries[:] * phase_shift + diff --git a/pycbc/waveform/waveform.py b/pycbc/waveform/waveform.py index dd2476f4c78..628e97044ab 100644 --- a/pycbc/waveform/waveform.py +++ b/pycbc/waveform/waveform.py @@ -988,13 +988,16 @@ def get_sgburst_waveform(template=None, **kwargs): # Organize Filter Generators _inspiral_fd_filters = {} _cuda_fd_filters = {} +_cupy_fd_filters = {} _cuda_fd_filters['SPAtmplt'] = spa_tmplt +_cupy_fd_filters['SPAtmplt'] = spa_tmplt _inspiral_fd_filters['SPAtmplt'] = spa_tmplt filter_wav = _scheme.ChooseBySchemeDict() filter_wav.update( {_scheme.CPUScheme:_inspiral_fd_filters, _scheme.CUDAScheme:_cuda_fd_filters, + _scheme.CUPYScheme:_cupy_fd_filters, } ) # Organize functions for function conditioning/precalculated values diff --git a/tox.ini b/tox.ini index 4c160e9c846..2a231fd8e05 100644 --- a/tox.ini +++ b/tox.ini @@ -75,6 +75,10 @@ setenv = PYCBC_TEST_TYPE=inference commands = bash tools/pycbc_test_suite.sh [testenv:py-docs] +deps = + {[base]deps} + {[bbhx]deps} + cupy-cuda12x setenv = PYCBC_TEST_TYPE=docs commands = bash tools/pycbc_test_suite.sh