diff --git a/pycbc/events/threshold_cupy.py b/pycbc/events/threshold_cupy.py
new file mode 100644
index 00000000000..c20fe18aa13
--- /dev/null
+++ b/pycbc/events/threshold_cupy.py
@@ -0,0 +1,259 @@
+# Copyright (C) 2012  Alex Nitz
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+
+import cupy as cp
+import functools
+import mako.template
+from .eventmgr import _BaseThresholdCluster
+import pycbc.scheme
+
+val = None
+loc = None
+
+# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy
+
+tkernel1 = mako.template.Template("""
+extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){
+    int s = window * blockIdx.x;
+    int e = s + window;
+
+    // shared memory for chuck size candidates
+    __shared__ float svr[${chunk}];
+    __shared__ float svi[${chunk}];
+    __shared__ int sl[${chunk}];
+
+    // shared memory for the warp size candidates
+    __shared__ float svv[32];
+    __shared__ int idx[32];
+
+    int ml = -1;
+    float mvr = 0;
+    float mvi = 0;
+    float re;
+    float im;
+
+    // Iterate trought the entire window size chunk and find blockDim.x number
+    // of candidates
+    for (int i = s + threadIdx.x; i < e; i += blockDim.x){
+        re = in[i].x;
+        im = in[i].y;
+        if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
+            mvr = re;
+            mvi = im;
+            ml = i;
+        }
+    }
+
+    // Save the candidate from this thread to shared memory
+    svr[threadIdx.x] = mvr;
+    svi[threadIdx.x] = mvi;
+    sl[threadIdx.x] = ml;
+
+    __syncthreads();
+
+    if (threadIdx.x < 32){
+        int tl = threadIdx.x;
+
+        // Now that we have all the candiates for this chunk in shared memory
+        // Iterate through in the warp size to reduce to 32 candidates
+        for (int i = threadIdx.x; i < ${chunk}; i += 32){
+            re = svr[i];
+            im = svi[i];
+            if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
+                tl = i;
+                mvr = re;
+                mvi = im;
+            }
+        }
+
+        // Store the 32 candidates into shared memory
+        svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl];
+        idx[threadIdx.x] = tl;
+
+        // Find the 1 candidate we are looking for using a manual log algorithm
+        if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){
+            svv[threadIdx.x] = svv[threadIdx.x + 16];
+            idx[threadIdx.x] = idx[threadIdx.x + 16];
+        }
+
+        if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){
+            svv[threadIdx.x] = svv[threadIdx.x + 8];
+            idx[threadIdx.x] = idx[threadIdx.x + 8];
+        }
+
+        if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){
+            svv[threadIdx.x] = svv[threadIdx.x + 4];
+            idx[threadIdx.x] = idx[threadIdx.x + 4];
+        }
+
+        if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){
+            svv[threadIdx.x] = svv[threadIdx.x + 2];
+            idx[threadIdx.x] = idx[threadIdx.x + 2];
+        }
+
+
+        // Save the 1 candidate maximum and location to the output vectors
+        if (threadIdx.x == 0){
+            if (svv[threadIdx.x] < svv[threadIdx.x + 1]){
+                idx[0] = idx[1];
+                svv[0] = svv[1];
+            }
+
+            if (svv[0] > threshold){
+                tl = idx[0];
+                outv[blockIdx.x].x = svr[tl];
+                outv[blockIdx.x].y = svi[tl];
+                outl[blockIdx.x] = sl[tl];
+            } else{
+                outl[blockIdx.x] = -1;
+            }
+        }
+    }
+}
+""")
+
+tkernel2 = mako.template.Template("""
+extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){
+    __shared__ int loc[${blocks}];
+    __shared__ float val[${blocks}];
+
+    int i = threadIdx.x;
+
+    int l = outl[i];
+    loc[i] = l;
+
+    if (l == -1)
+        return;
+
+    val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y;
+
+
+    // Check right
+    if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){
+        outl[i] = -1;
+        return;
+    }
+
+    // Check left
+    if ( (i > 0) && (val[i - 1] > val[i]) ){
+        outl[i] = -1;
+        return;
+    }
+}
+""")
+
+@functools.lru_cache(maxsize=None)
+def get_tkernel(slen, window):
+    if window < 32:
+        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
+
+    elif window <= 4096:
+        nt = 128
+    elif window <= 16384:
+        nt = 256
+    elif window <= 32768:
+        nt = 512
+    else:
+        nt = 1024
+
+    nb = int(cp.ceil(slen / float(window)))
+
+    if nb > 1024:
+        raise ValueError("More than 1024 blocks not supported yet")
+
+    fn = cp.RawKernel(
+        tkernel1.render(chunk=nt),
+        'threshold_and_cluster',
+        backend='nvcc'
+    )
+    fn2 = cp.RawKernel(
+        tkernel2.render(blocks=nb),
+        'threshold_and_cluster2',
+        backend='nvcc'
+    )
+    return (fn, fn2), nt, nb
+
+def threshold_and_cluster(series, threshold, window):
+    global val
+    global loc
+    if val is None:
+        val = cp.zeros(4096*256, dtype=cp.complex64)
+    if loc is None:
+        loc = cp.zeros(4096*256, cp.int32)
+
+    outl = loc
+    outv = val
+    slen = len(series)
+    series = series.data
+    (fn, fn2), nt, nb = get_tkernel(slen, window)
+    threshold = cp.float32(threshold * threshold)
+    window = cp.int32(window)
+
+    cl = loc[0:nb]
+    cv = val[0:nb]
+
+    fn((nb,), (nt,), (series.data, outv, outl, window, threshold))
+    fn2((1,), (nb,), (outv, outl, threshold, window))
+    w = (cl != -1)
+    return cv[w], cl[w]
+
+class CUDAThresholdCluster(_BaseThresholdCluster):
+    def __init__(self, series):
+        self.series = series
+
+        global val
+        global loc
+        if val is None:
+            val = cp.zeros(4096*256, dtype=cp.complex64)
+        if loc is None:
+            loc = cp.zeros(4096*256, cp.int32)
+
+        self.outl = loc
+        self.outv = val
+        self.slen = len(series)
+
+    def threshold_and_cluster(self, threshold, window):
+        threshold = cp.float32(threshold * threshold)
+        window = cp.int32(window)
+
+        (fn, fn2), nt, nb = get_tkernel(self.slen, window)
+        cl = loc[0:nb]
+        cv = val[0:nb]
+
+        fn(
+            (nt, 1, 1),
+            (nb, 1),
+            (self.series.data, self.outv, self.outl, window, threshold)
+        )
+        fn2(
+            (nb, 1, 1),
+            (1, 1),
+            (self.outv, self.outl, threshold, window)
+        )
+        w = (cl != -1)
+        return cp.asnumpy(cv[w]), cp.asnumpy(cl[w])
+
+def _threshold_cluster_factory(series):
+    return CUDAThresholdCluster
+
diff --git a/pycbc/fft/backend_cupy.py b/pycbc/fft/backend_cupy.py
new file mode 100644
index 00000000000..fec13ad746b
--- /dev/null
+++ b/pycbc/fft/backend_cupy.py
@@ -0,0 +1,37 @@
+#  Copyright (C) 2024 Y Ddraig Goch
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with with program; see the file COPYING. If not, write to the
+#  Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+#  MA  02111-1307  USA
+
+from .core import _list_available
+
+_backend_dict = {'cupy' : 'cupyfft'}
+_backend_list = ['cupy']
+
+_alist, _adict = _list_available(_backend_list, _backend_dict)
+
+cupy_backend = None
+
+def set_backend(backend_list):
+    global cupy_backend
+    for backend in backend_list:
+        if backend in _alist:
+            cupy_backend = backend
+            break
+
+def get_backend():
+    return _adict[cupy_backend]
+
+set_backend(_backend_list)
diff --git a/pycbc/fft/backend_support.py b/pycbc/fft/backend_support.py
index cae8c69eb19..ed1c76c7fe8 100644
--- a/pycbc/fft/backend_support.py
+++ b/pycbc/fft/backend_support.py
@@ -72,7 +72,7 @@ def get_backend():
 
 # Import all scheme-dependent backends, to get _all_backends accurate:
 
-for scheme_name in ["cpu", "mkl", "cuda"]:
+for scheme_name in ["cpu", "mkl", "cuda", "cupy"]:
     try:
         mod = __import__('pycbc.fft.backend_' + scheme_name, fromlist = ['_alist', '_adict'])
         _alist = getattr(mod, "_alist")
diff --git a/pycbc/fft/cupyfft.py b/pycbc/fft/cupyfft.py
new file mode 100644
index 00000000000..c45d7173608
--- /dev/null
+++ b/pycbc/fft/cupyfft.py
@@ -0,0 +1,88 @@
+# Copyright (C) 2024 Y Ddraig Goch
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+"""
+This module provides the cupy backend of the fast Fourier transform
+for the PyCBC package.
+"""
+
+import logging
+import cupy.fft
+from .core import _check_fft_args
+from .core import _BaseFFT, _BaseIFFT
+
+_INV_FFT_MSG = ("I cannot perform an {} between data with an input type of "
+                "{} and an output type of {}")
+
+def fft(invec, outvec, _, itype, otype):
+    if invec.ptr == outvec.ptr:
+        raise NotImplementedError("cupy backend of pycbc.fft does not "
+                                  "support in-place transforms")
+    if itype == 'complex' and otype == 'complex':
+        outvec.data[:] = cupy.asarray(cupy.fft.fft(invec.data),
+                                       dtype=outvec.dtype)
+    elif itype == 'real' and otype == 'complex':
+        outvec.data[:] = cupy.asarray(cupy.fft.rfft(invec.data),
+                                       dtype=outvec.dtype)
+    else:
+        raise ValueError(_INV_FFT_MSG.format("FFT", itype, otype))
+
+
+def ifft(invec, outvec, _, itype, otype):
+    if invec.ptr == outvec.ptr:
+        raise NotImplementedError("cupy backend of pycbc.fft does not "
+                                  "support in-place transforms")
+    if itype == 'complex' and otype == 'complex':
+        outvec.data[:] = cupy.asarray(cupy.fft.ifft(invec.data),
+                                       dtype=outvec.dtype)
+        outvec *= len(outvec)
+    elif itype == 'complex' and otype == 'real':
+        outvec.data[:] = cupy.asarray(cupy.fft.irfft(invec.data,len(outvec)),
+                                       dtype=outvec.dtype)
+        outvec *= len(outvec)
+    else:
+        raise ValueError(_INV_FFT_MSG.format("IFFT", itype, otype))
+
+
+class FFT(_BaseFFT):
+    """
+    Class for performing FFTs via the cupy interface.
+    """
+    def __init__(self, invec, outvec, nbatch=1, size=None):
+        super(FFT, self).__init__(invec, outvec, nbatch, size)
+        self.prec, self.itype, self.otype = _check_fft_args(invec, outvec)
+
+    def execute(self):
+        fft(self.invec, self.outvec, self.prec, self.itype, self.otype)
+
+
+class IFFT(_BaseIFFT):
+    """
+    Class for performing IFFTs via the cupy interface.
+    """
+    def __init__(self, invec, outvec, nbatch=1, size=None):
+        super(IFFT, self).__init__(invec, outvec, nbatch, size)
+        self.prec, self.itype, self.otype = _check_fft_args(invec, outvec)
+
+    def execute(self):
+        ifft(self.invec, self.outvec, self.prec, self.itype, self.otype)
diff --git a/pycbc/filter/matchedfilter_cupy.py b/pycbc/filter/matchedfilter_cupy.py
new file mode 100644
index 00000000000..1a5a01a65fa
--- /dev/null
+++ b/pycbc/filter/matchedfilter_cupy.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2024 Y Ddraig Goch
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+
+import cupy as cp
+from .matchedfilter import _BaseCorrelator
+
+# Here X,Y,Z are "type placeholder"s, so this covers 32 and 64 bit inputs.
+# It would work for real -> real as well, except maybe conj would fail.
+# I've also made this work for mixed types (32 bit -> 64 bit), but this means
+# we need to always supply the output, which we do.
+correlate_kernel = cp.ElementwiseKernel(
+    "X x, Y y",
+    "Z z",
+    "z = conj(x) * y",
+    "correlate_kernel"
+)
+
+def correlate(a, b, out):
+    correlate_kernel(a.data, b.data, out.data)
+
+class CUPYCorrelator(_BaseCorrelator):
+    def __init__(self, x, y, z):
+        self.x = x.data
+        self.y = y.data
+        self.z = z.data
+
+    def correlate(self):
+        correlate_kernel(self.x, self.y, self.z)
+
+def _correlate_factory(x, y, z):
+    return CUPYCorrelator
+
+
diff --git a/pycbc/psd/estimate.py b/pycbc/psd/estimate.py
index d69b5cadbb0..4a245a6584a 100644
--- a/pycbc/psd/estimate.py
+++ b/pycbc/psd/estimate.py
@@ -195,6 +195,7 @@ def welch(timeseries, seg_len=4096, seg_stride=2048, window='hann',
             median_bias(len(even_psds))
         psd = (odd_median + even_median) / 2
 
+    w = w.numpy()
     psd *= 2 * delta_f * seg_len / (w*w).sum()
 
     return FrequencySeries(psd, delta_f=delta_f, dtype=timeseries.dtype,
diff --git a/pycbc/scheme.py b/pycbc/scheme.py
index e6eb6f87f86..0a9e6740e1e 100644
--- a/pycbc/scheme.py
+++ b/pycbc/scheme.py
@@ -115,6 +115,30 @@ def __init__(self, device_num=0):
         import atexit
         atexit.register(clean_cuda,self.context)
 
+
+class CUPYScheme(Scheme):
+    """Scheme for using CUPY"""
+    def __init__(self, device_num=None):
+        import cupy # Fail now if cupy is not there.
+        import cupy.cuda
+        self.device_num = device_num
+        self.cuda_device = cupy.cuda.Device(self.device_num)
+    def __enter__(self):
+        super().__enter__()
+        self.cuda_device.__enter__()
+        logging.warn(
+            "You are using the CUPY GPU backend for PyCBC. This backend is "
+            "still only a prototype. It may be useful for your application "
+            "but it may fail unexpectedly, run slowly, or not give correct "
+            "output. Please do contribute to the effort to develop this "
+            "further."
+        )
+
+    def __exit__(self, *args):
+        super().__exit__(*args)
+        self.cuda_device.__exit__(*args)
+
+
 class CPUScheme(Scheme):
     def __init__(self, num_threads=1):
         if isinstance(num_threads, int):
@@ -160,6 +184,7 @@ class NumpyScheme(CPUScheme):
 scheme_prefix = {
     CUDAScheme: "cuda",
     CPUScheme: "cpu",
+    CUPYScheme: "cupy",
     MKLScheme: "mkl",
     NumpyScheme: "numpy",
 }
@@ -212,10 +237,10 @@ def _scheming_function(*args, **kwds):
 
                     return schemed_fn(*args, **kwds)
 
-                err = """Failed to find implementation of (%s)
-                      for %s scheme." % (str(fn), current_prefix())"""
+                err = (f"Failed to find implementation of {func.__name__} "
+                       f"for {current_prefix()} scheme. ")
                 for emsg in exc_errors:
-                    err += print(emsg)
+                    err += str(emsg) + " "
                 raise RuntimeError(err)
         return _scheming_function
 
@@ -292,6 +317,9 @@ def from_cli(opt):
         else:
             ctx = MKLScheme()
         logger.info("Running with MKL support: %s threads" % ctx.num_threads)
+    elif name == 'cupy':
+        logger.info("Running with CUPY support")
+        ctx = CUPYScheme()
     else:
         if len(scheme_str) > 1:
             numt = scheme_str[1]
diff --git a/pycbc/types/array_cupy.py b/pycbc/types/array_cupy.py
new file mode 100644
index 00000000000..a70a4f530e8
--- /dev/null
+++ b/pycbc/types/array_cupy.py
@@ -0,0 +1,140 @@
+# Copyright (C) 2024 Y Ddraig Goch
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+"""Cupy based CPU backend for PyCBC Array
+"""
+import cupy as cp
+from pycbc.types.array import common_kind, complex128, float64
+from scipy.linalg import blas
+from pycbc.types import real_same_precision_as
+
+def zeros(length, dtype=cp.float64):
+    return cp.zeros(length, dtype=dtype)
+
+def empty(length, dtype=cp.float64):
+    return cp.empty(length, dtype=dtype)
+
+def ptr(self):
+    return self.data.data.mem.ptr
+
+def dot(self, other):
+    return cp.dot(self._data,other)
+
+def min(self):
+    return self.data.min()
+
+def abs_max_loc(self):
+    if self.kind == 'real':
+        tmp = abs(self.data)
+        ind = cp.argmax(tmp)
+        return tmp[ind], ind
+    else:
+        tmp = self.data.real ** 2.0
+        tmp += self.data.imag ** 2.0
+        ind = cp.argmax(tmp)
+        return tmp[ind] ** 0.5, ind
+
+def cumsum(self):
+    return self.data.cumsum()
+
+def max(self):
+    return self.data.max()
+
+def max_loc(self):
+    ind = cp.argmax(self.data)
+    return self.data[ind], ind
+
+def take(self, indices):
+    return self.data.take(indices)
+
+def weighted_inner(self, other, weight):
+    """ Return the inner product of the array with complex conjugation.
+    """
+    if weight is None:
+        return self.inner(other)
+
+    cdtype = common_kind(self.dtype, other.dtype)
+    if cdtype.kind == 'c':
+        acum_dtype = complex128
+    else:
+        acum_dtype = float64
+
+    return cp.sum(self.data.conj() * other / weight, dtype=acum_dtype)
+
+def abs_arg_max(self):
+    if self.dtype == cp.float32 or self.dtype == cp.float64:
+        return cp.argmax(abs(self.data))
+    else:
+        return abs_arg_max_complex(self._data)
+
+def inner(self, other):
+    """ Return the inner product of the array with complex conjugation.
+    """
+    cdtype = common_kind(self.dtype, other.dtype)
+    if cdtype.kind == 'c':
+        return cp.sum(self.data.conj() * other, dtype=complex128)
+    else:
+        return inner_real(self.data, other)
+
+def vdot(self, other):
+    """ Return the inner product of the array with complex conjugation.
+    """
+    return cp.vdot(self.data, other)
+
+def squared_norm(self):
+    """ Return the elementwise squared norm of the array """
+    return (self.data.real**2 + self.data.imag**2)
+
+def numpy(self):
+    return cp.asnumpy(self.data)
+
+def _copy(self, self_ref, other_ref):
+    self_ref[:] = other_ref[:]
+
+def _getvalue(self, index):
+    return self._data[index]
+
+def sum(self):
+    if self.kind == 'real':
+        return cp.sum(self._data,dtype=float64)
+    else:
+        return cp.sum(self._data,dtype=complex128)
+
+def clear(self):
+    self[:] = 0
+
+def _scheme_matches_base_array(array):
+    if isinstance(array, cp.ndarray):
+        return True
+    else:
+        return False
+
+def _to_device(array):
+    return cp.asarray(array)
+
+def numpy(self):
+    return cp.asnumpy(self._data)
+
+def _copy_base_array(array):
+    return array.copy()
+
diff --git a/pycbc/types/frequencyseries.py b/pycbc/types/frequencyseries.py
index 2b0a0115fe9..d0d51e64af9 100644
--- a/pycbc/types/frequencyseries.py
+++ b/pycbc/types/frequencyseries.py
@@ -472,12 +472,13 @@ def to_timeseries(self, delta_t=None):
             tmp = self
         else:
             tmp = FrequencySeries(zeros(flen, dtype=self.dtype),
-                             delta_f=self.delta_f, epoch=self.epoch)
+                             delta_f=self.delta_f, epoch=self.epoch,
+                             copy=False)
             tmp[:len(self)] = self[:]
 
         f = TimeSeries(zeros(tlen,
                            dtype=real_same_precision_as(self)),
-                           delta_t=delta_t)
+                           delta_t=delta_t, copy=False)
         ifft(tmp, f)
         f._delta_t = delta_t
         return f
diff --git a/pycbc/vetoes/chisq_cupy.py b/pycbc/vetoes/chisq_cupy.py
new file mode 100644
index 00000000000..51b0b7e7912
--- /dev/null
+++ b/pycbc/vetoes/chisq_cupy.py
@@ -0,0 +1,345 @@
+# Copyright (C) 2015  Alex Nitz, Josh Willis
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+
+import functools
+import numpy
+import cupy as cp
+import lal
+from mako.template import Template
+
+LALARGS = {
+    'TWOPI': lal.TWOPI,
+}
+
+accum_diff_sq_kernel = cp.ElementwiseKernel(
+    "X input",
+    "raw Y output",
+    "output[i] += norm(input)",
+    "accum_diff_sq_kernel"
+)
+
+def chisq_accum_bin(chisq, q):
+    accum_diff_sq_kernel(q.data, chisq.data)
+
+
+chisqkernel = Template("""
+#include <cstdint>
+extern "C" __global__ void power_chisq_at_points_${NP}(
+                                      %if fuse:
+                                          float2* htilde,
+                                          float2* stilde,
+                                      %else:
+                                          float2* corr,
+                                      %endif
+                                      float2* outc, unsigned int N,
+                                      %for p in range(NP):
+                                        float phase${p},
+                                      %endfor
+                                      uint32_t* kmin,
+                                      uint32_t* kmax,
+                                      uint32_t* bv,
+                                      unsigned int nbins){
+    __shared__ unsigned int s;
+    __shared__ unsigned int e;
+    __shared__ float2 chisq[${NT} * ${NP}];
+
+    // load integration boundaries (might not be bin boundaries if bin is large)
+    if (threadIdx.x == 0){
+        s = kmin[blockIdx.x];
+        e = kmax[blockIdx.x];
+    }
+
+    % for p in range(NP):
+        chisq[threadIdx.x + ${NT*p}].x = 0;
+        chisq[threadIdx.x + ${NT*p}].y = 0;
+    % endfor
+    __syncthreads();
+
+    // calculate the chisq integral for each thread
+    // sliding reduction for each thread from s, e
+    for (int i = threadIdx.x + s; i < e; i += blockDim.x){
+        float re, im;
+
+        %if fuse:
+            float2 qt, st, ht;
+            st = stilde[i];
+            ht = htilde[i];
+            qt.x = ht.x * st.x + ht.y * st.y;
+            qt.y = ht.x * st.y - ht.y * st.x;
+        %else:
+            float2 qt = corr[i];
+        %endif
+
+        %for p in range(NP):
+            sincosf(phase${p} * i, &im, &re);
+            chisq[threadIdx.x + ${NT*p}].x += re * qt.x - im * qt.y;
+            chisq[threadIdx.x + ${NT*p}].y += im * qt.x + re * qt.y;
+        %endfor
+    }
+
+    float x, y, x2, y2;
+    // logarithmic reduction within thread block
+    for (int j=${NT} / 2; j>=1; j/=2){
+        if (threadIdx.x <j){
+            %for p in range(NP):
+                __syncthreads();
+                x = chisq[threadIdx.x + ${NT*p}].x;
+                y = chisq[threadIdx.x + ${NT*p}].y;
+                x2 = chisq[threadIdx.x + j + ${NT*p}].x;
+                y2 = chisq[threadIdx.x + j + ${NT*p}].y;
+                 __syncthreads();
+                chisq[threadIdx.x + ${NT*p}].x = x + x2;
+                chisq[threadIdx.x + ${NT*p}].y = y + y2;
+            %endfor
+        }
+    }
+
+    if (threadIdx.x == 0){
+        % for p in range(NP):
+            atomicAdd(&outc[bv[blockIdx.x] + nbins * ${p}].x, chisq[0 + ${NT*p}].x);
+            atomicAdd(&outc[bv[blockIdx.x] + nbins * ${p}].y, chisq[0 + ${NT*p}].y);
+        % endfor
+    }
+
+}
+""")
+
+chisqkernel_pow2 = Template("""
+#include <cstdint>
+extern "C" __global__ void power_chisq_at_points_${NP}_pow2(
+                                      %if fuse:
+                                          float2* htilde,
+                                          float2* stilde,
+                                      %else:
+                                          float2* corr,
+                                      %endif
+                                      float2* outc, unsigned int N,
+                                      %for p in range(NP):
+                                        unsigned int points${p},
+                                      %endfor
+                                      uint32_t* kmin,
+                                      uint32_t* kmax,
+                                      uint32_t* bv,
+                                      unsigned int nbins){
+    __shared__ unsigned int s;
+    __shared__ unsigned int e;
+    __shared__ float2 chisq[${NT} * ${NP}];
+    float twopi = ${TWOPI};
+    unsigned long long NN;
+
+    NN = (unsigned long long) N;
+
+    // load integration boundaries (might not be bin boundaries if bin is large)
+    if (threadIdx.x == 0){
+        s = kmin[blockIdx.x];
+        e = kmax[blockIdx.x];
+    }
+
+    % for p in range(NP):
+        chisq[threadIdx.x + ${NT*p}].x = 0;
+        chisq[threadIdx.x + ${NT*p}].y = 0;
+    % endfor
+    __syncthreads();
+
+    // calculate the chisq integral for each thread
+    // sliding reduction for each thread from s, e
+    for (int i = threadIdx.x + s; i < e; i += blockDim.x){
+        float re, im;
+
+        %if fuse:
+            float2 qt, st, ht;
+            st = stilde[i];
+            ht = htilde[i];
+            qt.x = ht.x * st.x + ht.y * st.y;
+            qt.y = ht.x * st.y - ht.y * st.x;
+        %else:
+            float2 qt = corr[i];
+        %endif
+
+        %for p in range(NP):
+            unsigned long long prod${p} = points${p} * i;
+            unsigned int k${p} = (unsigned int) (prod${p}&(NN-1));
+            float phase${p} = twopi * k${p}/((float) N);
+            __sincosf(phase${p}, &im, &re);
+            chisq[threadIdx.x + ${NT*p}].x += re * qt.x - im * qt.y;
+            chisq[threadIdx.x + ${NT*p}].y += im * qt.x + re * qt.y;
+        %endfor
+    }
+
+    float x, y, x2, y2;
+    // logarithmic reduction within thread block
+    for (int j=${NT} / 2; j>=1; j/=2){
+        if (threadIdx.x <j){
+            %for p in range(NP):
+                __syncthreads();
+                x = chisq[threadIdx.x + ${NT*p}].x;
+                y = chisq[threadIdx.x + ${NT*p}].y;
+                x2 = chisq[threadIdx.x + j + ${NT*p}].x;
+                y2 = chisq[threadIdx.x + j + ${NT*p}].y;
+                 __syncthreads();
+                chisq[threadIdx.x + ${NT*p}].x = x + x2;
+                chisq[threadIdx.x + ${NT*p}].y = y + y2;
+            %endfor
+        }
+    }
+
+    if (threadIdx.x == 0){
+        % for p in range(NP):
+            atomicAdd(&outc[bv[blockIdx.x] + nbins * ${p}].x, chisq[0 + ${NT*p}].x);
+            atomicAdd(&outc[bv[blockIdx.x] + nbins * ${p}].y, chisq[0 + ${NT*p}].y);
+        % endfor
+    }
+
+}
+""")
+
+@functools.lru_cache(maxsize=None)
+def get_pchisq_fn(np, fuse_correlate=False):
+    nt = 256
+    fn = cp.RawKernel(
+        chisqkernel.render(NT=nt, NP=np, fuse=fuse_correlate, **LALARGS),
+        f'power_chisq_at_points_{np}',
+        backend='nvcc'
+    )
+    return fn, nt
+
+
+@functools.lru_cache(maxsize=None)
+def get_pchisq_fn_pow2(np, fuse_correlate=False):
+    nt = 256
+    fn = cp.RawKernel(
+        chisqkernel_pow2.render(NT=nt, NP=np, fuse=fuse_correlate, **LALARGS),
+        f'power_chisq_at_points_{np}_pow2',
+        backend='nvcc'
+    )
+    return fn, nt
+
+def get_cached_bin_layout(bins):
+    bv, kmin, kmax = [], [], []
+    for i in range(len(bins)-1):
+        s, e = bins[i], bins[i+1]
+        BS = 4096
+        if (e - s) < BS:
+            bv.append(i)
+            kmin.append(s)
+            kmax.append(e)
+        else:
+            k = list(numpy.arange(s, e, BS/2))
+            kmin += k
+            kmax += k[1:] + [e]
+            bv += [i]*len(k)
+    bv = cp.array(bv, dtype=cp.uint32)
+    kmin = cp.array(kmin, dtype=cp.uint32)
+    kmax = cp.array(kmax, dtype=cp.uint32)
+    return kmin, kmax, bv
+
+def shift_sum_points(num, N, arg_tuple):
+    #fuse = 'fuse' in corr.gpu_callback_method
+    fuse = False
+
+    fn, nt = get_pchisq_fn(num, fuse_correlate = fuse)
+    corr, outp, phase, np, nb, N, kmin, kmax, bv, nbins = arg_tuple
+    if fuse:
+        args = [corr.htilde.data, corr.stilde.data]
+    else:
+        args = [corr.data]
+    args += [outp, N] + phase[0:num]
+    args += [kmin, kmax, bv, nbins]
+    fn(
+        (nb,),
+        (nt,),
+        *args,
+    )
+    
+    outp = outp[num*nbins:]
+    phase = phase[num:]
+    np -= num
+    return outp, phase, np
+
+def shift_sum_points_pow2(num, arg_tuple):
+    #fuse = 'fuse' in corr.gpu_callback_method
+    fuse = False
+
+    fn, nt = get_pchisq_fn_pow2(num, fuse_correlate = fuse)
+
+    corr, outp, points, np, nb, N, kmin, kmax, bv, nbins = arg_tuple
+    if fuse:
+        args = [corr.htilde.data, corr.stilde.data]
+    else:
+        args = [corr.data]
+    args += [outp, N] + points[0:num] + [kmin, kmax, bv, nbins]
+    fn(
+        (nb,),
+        (nt,),
+        tuple(args)
+    )
+            
+    outp = outp[num*nbins:]
+    points = points[num:]
+    np -= num
+    return outp, points, np
+
+@functools.lru_cache(maxsize=None)
+def get_cached_pow2(N):
+    return not(N & (N-1))
+
+def shift_sum(corr, points, bins):
+    kmin, kmax, bv = get_cached_bin_layout(bins)
+    nb = len(kmin)
+    N = numpy.uint32(len(corr))
+    is_pow2 = get_cached_pow2(N)
+    nbins = numpy.uint32(len(bins) - 1)
+    outc = cp.zeros((len(points), nbins), dtype=numpy.complex64)
+    outp = outc.reshape(nbins * len(points))
+    np = len(points)
+
+    if is_pow2:
+        lpoints = points.tolist()
+        while np > 0:
+            cargs = (corr, outp, lpoints, np, nb, N, kmin, kmax, bv, nbins)
+
+            if np >= 4:
+                outp, lpoints, np = shift_sum_points_pow2(4, cargs)
+            elif np >= 3:
+                outp, lpoints, np = shift_sum_points_pow2(3, cargs)
+            elif np >= 2:
+                outp, lpoints, np = shift_sum_points_pow2(2, cargs)
+            elif np == 1:
+                outp, lpoints, np = shift_sum_points_pow2(1, cargs)
+    else:
+        phase = [numpy.float32(p * 2.0 * numpy.pi / N) for p in points]
+        while np > 0:
+            cargs = (corr, outp, phase, np, nb, N, kmin, kmax, bv, nbins)
+
+            if np >= 4:
+                outp, phase, np = shift_sum_points(4, cargs) # pylint:disable=no-value-for-parameter
+            elif np >= 3:
+                outp, phase, np = shift_sum_points(3, cargs) # pylint:disable=no-value-for-parameter
+            elif np >= 2:
+                outp, phase, np = shift_sum_points(2, cargs) # pylint:disable=no-value-for-parameter
+            elif np == 1:
+                outp, phase, np = shift_sum_points(1, cargs) # pylint:disable=no-value-for-parameter
+
+    return cp.asnumpy((outc.conj() * outc).sum(axis=1).real)
+
diff --git a/pycbc/waveform/spa_tmplt_cupy.py b/pycbc/waveform/spa_tmplt_cupy.py
new file mode 100644
index 00000000000..63d429eff47
--- /dev/null
+++ b/pycbc/waveform/spa_tmplt_cupy.py
@@ -0,0 +1,100 @@
+#
+#  Apapted from code in LALSimInpspiralTaylorF2.c
+#
+#  Copyright (C) 2007 Jolien Creighton, B.S. Sathyaprakash, Thomas Cokelaer
+#  Copyright (C) 2012 Leo Singer, Alex Nitz
+#  Adapted from code found in:
+#    - LALSimInspiralTaylorF2.c
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with with program; see the file COPYING. If not, write to the
+#  Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+#  MA  02111-1307  USA
+
+import cupy as cp
+import lal
+import mako.template
+
+taylorf2_text = mako.template.Template("""
+    const float f = (i + kmin ) * delta_f;
+    const float amp2 = amp * __powf(f, -7.0/6.0);
+    const float v =  __powf(piM*f, 1.0/3.0);
+    const float v2 = v * v;
+    const float v3 = v2 * v;
+    const float v4 = v2 * v2;
+    const float v5 = v2 * v3;
+    const float v6 = v3 * v3;
+    const float v7 = v3 * v4;
+    float phasing = 0.;
+
+    float LAL_TWOPI = ${TWOPI};
+    float LAL_PI_4 = ${PI_4};
+    float log4 = ${LN4};
+    float logv = __logf(v);
+
+    switch (phase_order)
+    {
+        case -1:
+        case 7:
+            phasing += pfa7 * v7;
+        case 6:
+            phasing += (pfa6 + pfl6 * (logv + log4) ) * v6;
+        case 5:
+            phasing += (pfa5 + pfl5 * (logv) ) * v5;
+        case 4:
+            phasing += pfa4 * v4;
+        case 3:
+            phasing += pfa3 * v3;
+        case 2:
+            phasing += pfa2 * v2;
+        case 0:
+            phasing += 1.;
+            break;
+        default:
+            break;
+    }
+    phasing *= pfaN / v5;
+    phasing -=  LAL_PI_4;
+    phasing -= int(phasing / (LAL_TWOPI)) * LAL_TWOPI;
+
+    float pcos;
+    float psin;
+    __sincosf(phasing, &psin, &pcos);
+
+    htilde.real(pcos * amp2);
+    htilde.imag(-psin * amp2);
+""").render(TWOPI=lal.TWOPI, PI_4=lal.PI_4, LN4=2*lal.LN2)
+
+
+taylorf2_kernel = cp.ElementwiseKernel(
+    """
+        int64 kmin, int64 phase_order, float32 delta_f, float32 piM,
+        float32 pfaN, float32 pfa2, float32 pfa3, float32 pfa4, float32 pfa5,
+        float32 pfl5, float32 pfa6, float32 pfl6, float32 pfa7, float32 amp
+    """,
+    "complex64 htilde",
+    taylorf2_text,
+    "taylorf2_kernel",
+)
+
+
+def spa_tmplt_engine(htilde,  kmin,  phase_order,
+                    delta_f,  piM,  pfaN,
+                    pfa2,  pfa3,  pfa4,  pfa5,  pfl5,
+                    pfa6,  pfl6,  pfa7, amp_factor):
+    """ Calculate the spa tmplt phase
+    """
+    taylorf2_kernel(kmin,  phase_order,
+                    delta_f,  piM,  pfaN,
+                    pfa2,  pfa3,  pfa4,  pfa5,  pfl5,
+                    pfa6,  pfl6,  pfa7, amp_factor, htilde.data)
diff --git a/pycbc/waveform/utils_cupy.py b/pycbc/waveform/utils_cupy.py
new file mode 100644
index 00000000000..22b1fdc5a76
--- /dev/null
+++ b/pycbc/waveform/utils_cupy.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2018 Collin Capano, Josh Willis
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+"""This module contains the CuPy-specific code for
+   convenience utilities for manipulating waveforms
+"""
+from pycbc.types import FrequencySeries
+import cupy as xp
+
+
+
+def apply_fseries_time_shift(htilde, dt, kmin=0, copy=True):
+    """Shifts a frequency domain waveform in time. The waveform is assumed to
+    be sampled at equal frequency intervals.
+    """
+    out = xp.array(htilde.data, copy=copy)
+    phi = -2j * xp.pi * dt * htilde.delta_f
+    kmax = len(htilde)
+    fstimeshift(out, phi, kmin, kmax)
+    if copy:
+        htilde = FrequencySeries(out, delta_f=htilde.delta_f,
+                                 epoch=htilde.epoch, copy=False)
+    return htilde
+
+
+def fstimeshift(freqseries, phi, kmin, kmax):
+    # Please note that the CPU version has a number of optimizations with respect
+    # to this.
+    # FIXME: Convert to ElementwiseKernel and use kmin and max in that.
+    idx = xp.arange(len(freqseries))
+    phase_shift = xp.exp(phi * idx)
+    freqseries[:] = freqseries[:] * phase_shift 
+
diff --git a/pycbc/waveform/waveform.py b/pycbc/waveform/waveform.py
index dd2476f4c78..628e97044ab 100644
--- a/pycbc/waveform/waveform.py
+++ b/pycbc/waveform/waveform.py
@@ -988,13 +988,16 @@ def get_sgburst_waveform(template=None, **kwargs):
 # Organize Filter Generators
 _inspiral_fd_filters = {}
 _cuda_fd_filters = {}
+_cupy_fd_filters = {}
 
 _cuda_fd_filters['SPAtmplt'] = spa_tmplt
+_cupy_fd_filters['SPAtmplt'] = spa_tmplt
 _inspiral_fd_filters['SPAtmplt'] = spa_tmplt
 
 filter_wav = _scheme.ChooseBySchemeDict()
 filter_wav.update( {_scheme.CPUScheme:_inspiral_fd_filters,
                     _scheme.CUDAScheme:_cuda_fd_filters,
+                    _scheme.CUPYScheme:_cupy_fd_filters,
                    } )
 
 # Organize functions for function conditioning/precalculated values
diff --git a/tox.ini b/tox.ini
index 4c160e9c846..2a231fd8e05 100644
--- a/tox.ini
+++ b/tox.ini
@@ -75,6 +75,10 @@ setenv = PYCBC_TEST_TYPE=inference
 commands = bash tools/pycbc_test_suite.sh
 
 [testenv:py-docs]
+deps =
+    {[base]deps}
+    {[bbhx]deps}
+    cupy-cuda12x
 setenv =
     PYCBC_TEST_TYPE=docs
 commands = bash tools/pycbc_test_suite.sh