Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prototype cupy backend #4952

Merged
merged 31 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
d2a79e6
Developing GPU CUPY backend
spxiwh Nov 17, 2024
f72d346
Don't import aligned!!
spxiwh Nov 17, 2024
e26afa9
Adding cupy FFT to scheme
spxiwh Nov 17, 2024
d29743a
Don't understand why copy here??
spxiwh Nov 17, 2024
670ad34
Get right memory pointer
spxiwh Nov 17, 2024
a745c8d
Add CUPY scheme to CLI
spxiwh Nov 17, 2024
e31dd1b
Typo in scheme.py
spxiwh Nov 17, 2024
5e321f3
Remove warning
spxiwh Nov 19, 2024
dd7bc60
Adding some more array stuff
spxiwh Nov 19, 2024
f0e43f2
The i is kind of important for a phase shift!!
spxiwh Nov 19, 2024
cf0d2ce
Add device num support
spxiwh Nov 19, 2024
8a76a6d
Set device number
spxiwh Nov 20, 2024
059dc9e
Add prototype spa_tmplt_cupy
spxiwh Nov 21, 2024
1e402ba
Adding new parts of CUPY backend
spxiwh Nov 22, 2024
8f5b539
Typo in scheme.py
spxiwh Nov 22, 2024
d404116
Need to force numpy in PSD estimate
spxiwh Nov 22, 2024
a273450
Add SPATmplt in waveform.py
spxiwh Nov 22, 2024
27090ba
Starting to understand RawKernel and CUDA
spxiwh Nov 24, 2024
7c463fa
Remove print statement
spxiwh Nov 24, 2024
c49ccea
Force numpy returns
spxiwh Nov 25, 2024
2d7a514
Gareth's comments
spxiwh Nov 25, 2024
e9e8c66
Try to fix test suite
spxiwh Nov 25, 2024
89843ea
Refactor LAL constants
spxiwh Nov 25, 2024
ea040b2
Probably need BBHx
spxiwh Nov 25, 2024
56bf071
Move cupy array declaration to when needed
spxiwh Nov 25, 2024
5de37e4
Typo
spxiwh Nov 25, 2024
c21f57a
Don't want time-reversed waveforms!
spxiwh Nov 28, 2024
7850921
Add FIXME
spxiwh Nov 28, 2024
62a59e5
Testing chisq optimization - DO NOT MERGE
spxiwh Nov 28, 2024
1c49374
Revert "Testing chisq optimization - DO NOT MERGE"
spxiwh Dec 2, 2024
34fb40e
Add deprecated warning
spxiwh Dec 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 259 additions & 0 deletions pycbc/events/threshold_cupy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
# Copyright (C) 2012 Alex Nitz
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


#
# =============================================================================
#
# Preamble
#
# =============================================================================
#

import cupy as cp
import functools
import mako.template
from .eventmgr import _BaseThresholdCluster
import pycbc.scheme

# Module-level device-side scratch buffers, shared by threshold_and_cluster()
# and CUDAThresholdCluster below.  They are allocated lazily (4096*256
# elements) on first use so that importing this module does not touch the GPU.
val = None
loc = None

# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy

tkernel1 = mako.template.Template("""
extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){
int s = window * blockIdx.x;
int e = s + window;

// shared memory for chuck size candidates
__shared__ float svr[${chunk}];
__shared__ float svi[${chunk}];
__shared__ int sl[${chunk}];

// shared memory for the warp size candidates
__shared__ float svv[32];
__shared__ int idx[32];

int ml = -1;
float mvr = 0;
float mvi = 0;
float re;
float im;

// Iterate trought the entire window size chunk and find blockDim.x number
// of candidates
for (int i = s + threadIdx.x; i < e; i += blockDim.x){
re = in[i].x;
im = in[i].y;
if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
mvr = re;
mvi = im;
ml = i;
}
}

// Save the candidate from this thread to shared memory
svr[threadIdx.x] = mvr;
svi[threadIdx.x] = mvi;
sl[threadIdx.x] = ml;

__syncthreads();

if (threadIdx.x < 32){
int tl = threadIdx.x;

// Now that we have all the candiates for this chunk in shared memory
// Iterate through in the warp size to reduce to 32 candidates
for (int i = threadIdx.x; i < ${chunk}; i += 32){
re = svr[i];
im = svi[i];
if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
tl = i;
mvr = re;
mvi = im;
}
}

// Store the 32 candidates into shared memory
svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl];
idx[threadIdx.x] = tl;

// Find the 1 candidate we are looking for using a manual log algorithm
if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){
svv[threadIdx.x] = svv[threadIdx.x + 16];
idx[threadIdx.x] = idx[threadIdx.x + 16];
}

if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){
svv[threadIdx.x] = svv[threadIdx.x + 8];
idx[threadIdx.x] = idx[threadIdx.x + 8];
}

if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){
svv[threadIdx.x] = svv[threadIdx.x + 4];
idx[threadIdx.x] = idx[threadIdx.x + 4];
}

if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){
svv[threadIdx.x] = svv[threadIdx.x + 2];
idx[threadIdx.x] = idx[threadIdx.x + 2];
}


// Save the 1 candidate maximum and location to the output vectors
if (threadIdx.x == 0){
if (svv[threadIdx.x] < svv[threadIdx.x + 1]){
idx[0] = idx[1];
svv[0] = svv[1];
}

if (svv[0] > threshold){
tl = idx[0];
outv[blockIdx.x].x = svr[tl];
outv[blockIdx.x].y = svi[tl];
outl[blockIdx.x] = sl[tl];
} else{
outl[blockIdx.x] = -1;
}
}
}
}
""")

tkernel2 = mako.template.Template("""
extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){
__shared__ int loc[${blocks}];
__shared__ float val[${blocks}];

int i = threadIdx.x;

int l = outl[i];
loc[i] = l;

if (l == -1)
return;

val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y;


// Check right
if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){
outl[i] = -1;
return;
}

// Check left
if ( (i > 0) && (val[i - 1] > val[i]) ){
outl[i] = -1;
return;
}
}
""")

@functools.lru_cache(maxsize=None)
def get_tkernel(slen, window):
    """Compile (and cache) the threshold/cluster kernels for a given problem.

    Parameters
    ----------
    slen : int
        Length of the series the kernels will operate on.
    window : int
        Clustering window length in samples; must be at least 32.

    Returns
    -------
    (fn, fn2) : tuple of cupy.RawKernel
        The per-window reduction kernel and the cross-window cluster kernel.
    nt : int
        Threads per block for ``fn``.
    nb : int
        Number of windows (blocks for ``fn``, threads for ``fn2``).

    Raises
    ------
    ValueError
        If ``window`` is below 32 samples or more than 1024 windows would
        be required.
    """
    if window < 32:
        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
    elif window <= 4096:
        nt = 128
    elif window <= 16384:
        nt = 256
    elif window <= 32768:
        nt = 512
    else:
        nt = 1024

    # Exact host-side ceiling division.  The previous
    # int(cp.ceil(slen / float(window))) round-tripped a host scalar through
    # the GPU and float64 division, which can misround for very large slen.
    nb = -(-int(slen) // int(window))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    # backend='nvcc' is required for these kernels; see the URL at the top
    # of this module.
    fn = cp.RawKernel(
        tkernel1.render(chunk=nt),
        'threshold_and_cluster',
        backend='nvcc'
    )
    fn2 = cp.RawKernel(
        tkernel2.render(blocks=nb),
        'threshold_and_cluster2',
        backend='nvcc'
    )
    return (fn, fn2), nt, nb

def threshold_and_cluster(series, threshold, window):
    """Find samples of ``series`` above ``threshold``, clustered per window.

    Parameters
    ----------
    series : pycbc Array (complex64)
        The series to threshold; its ``.data`` attribute must be a cupy array.
    threshold : float
        Magnitude threshold (compared against ``|x|^2 > threshold^2``).
    window : int
        Clustering window length in samples.

    Returns
    -------
    values, locations : cupy arrays
        Complex values and sample indices of the surviving candidates.
    """
    global val
    global loc
    # Lazily allocate the shared device-side output buffers.
    if val is None:
        val = cp.zeros(4096*256, dtype=cp.complex64)
    if loc is None:
        loc = cp.zeros(4096*256, cp.int32)

    outl = loc
    outv = val
    slen = len(series)
    series = series.data
    (fn, fn2), nt, nb = get_tkernel(slen, window)
    threshold = cp.float32(threshold * threshold)
    window = cp.int32(window)

    cl = loc[0:nb]
    cv = val[0:nb]

    # Pass the cupy ndarray itself rather than series.data (a raw memory
    # pointer): the ndarray is the documented RawKernel argument type, and
    # this matches CUDAThresholdCluster.threshold_and_cluster below.
    fn((nb,), (nt,), (series, outv, outl, window, threshold))
    fn2((1,), (nb,), (outv, outl, threshold, window))
    w = (cl != -1)
    return cv[w], cl[w]

class CUDAThresholdCluster(_BaseThresholdCluster):
    """Threshold-and-cluster engine reusing the cupy kernels above.

    Holds a reference to the series and the shared module-level output
    buffers so repeated calls avoid reallocation.
    """
    def __init__(self, series):
        # series: pycbc Array whose .data is a cupy complex64 array.
        self.series = series

        global val
        global loc
        # Lazily allocate the shared device-side output buffers.
        if val is None:
            val = cp.zeros(4096*256, dtype=cp.complex64)
        if loc is None:
            loc = cp.zeros(4096*256, cp.int32)

        self.outl = loc
        self.outv = val
        self.slen = len(series)

    def threshold_and_cluster(self, threshold, window):
        """Return (values, locations) above ``threshold``, as numpy arrays."""
        threshold = cp.float32(threshold * threshold)
        window = cp.int32(window)

        (fn, fn2), nt, nb = get_tkernel(self.slen, window)
        cl = loc[0:nb]
        cv = val[0:nb]

        # Launch with grid=(nb,...) and block=(nt,...): tkernel1 selects its
        # window via blockIdx.x and its threads via threadIdx.x, so the grid
        # must hold one block per window.  The previous code transposed the
        # grid and block arguments relative to the module-level
        # threshold_and_cluster function.
        fn(
            (nb, 1, 1),
            (nt, 1, 1),
            (self.series.data, self.outv, self.outl, window, threshold)
        )
        # Second stage: a single block with one thread per window.
        fn2(
            (1, 1, 1),
            (nb, 1, 1),
            (self.outv, self.outl, threshold, window)
        )
        w = (cl != -1)
        return cp.asnumpy(cv[w]), cp.asnumpy(cl[w])

def _threshold_cluster_factory(series):
    """Return the threshold/cluster class for this scheme.

    ``series`` is accepted for API compatibility with the other backends'
    factories; the class itself (not an instance) is returned.
    """
    return CUDAThresholdCluster

37 changes: 37 additions & 0 deletions pycbc/fft/backend_cupy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (C) 2024 Y Ddraig Goch
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307 USA

from .core import _list_available

# Map from backend name to the pycbc.fft submodule implementing it.
_backend_dict = {'cupy' : 'cupyfft'}
# Preference-ordered list of backend names to try.
_backend_list = ['cupy']

# Filter down to the backends that can actually be imported in this
# environment.
_alist, _adict = _list_available(_backend_list, _backend_dict)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The backend_cuda version of this has the if pycbc.HAVE_CUDA statement and this doesn't. This makes me think, should this backend work when not on a GPU?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's used to stop the tests failing here when no GPU is present by not loading any CUDA module ... I'll probably need this, but it might be possible to have the documentation and help text run for the cupy backend, even if the code isn't going to work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Clearly the tests do need to pass before merging).


# Name of the backend currently in use; chosen by set_backend() below.
cupy_backend = None

def set_backend(backend_list):
    """Select the first available backend named in ``backend_list``.

    Leaves the module-level ``cupy_backend`` unchanged when none of the
    requested backends is available.
    """
    global cupy_backend
    chosen = next((name for name in backend_list if name in _alist), None)
    if chosen is not None:
        cupy_backend = chosen

def get_backend():
    """Return the module implementing the currently selected backend."""
    return _adict[cupy_backend]

# Initialize with the default preference list at import time.
set_backend(_backend_list)
2 changes: 1 addition & 1 deletion pycbc/fft/backend_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def get_backend():

# Import all scheme-dependent backends, to get _all_backends accurate:

for scheme_name in ["cpu", "mkl", "cuda"]:
for scheme_name in ["cpu", "mkl", "cuda", "cupy"]:
try:
mod = __import__('pycbc.fft.backend_' + scheme_name, fromlist = ['_alist', '_adict'])
_alist = getattr(mod, "_alist")
Expand Down
88 changes: 88 additions & 0 deletions pycbc/fft/cupyfft.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright (C) 2024 Y Ddraig Goch
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

#
# =============================================================================
#
# Preamble
#
# =============================================================================
#
"""
This module provides the cupy backend of the fast Fourier transform
for the PyCBC package.
"""

import logging
import cupy.fft
from .core import _check_fft_args
from .core import _BaseFFT, _BaseIFFT

# Error template for unsupported input/output type combinations, formatted
# with (direction, input type, output type).
_INV_FFT_MSG = ("I cannot perform an {} between data with an input type of "
                "{} and an output type of {}")

def fft(invec, outvec, _, itype, otype):
    """Perform a forward FFT from ``invec`` into ``outvec`` via cupy.

    ``itype``/``otype`` select complex-to-complex or real-to-complex; the
    precision argument is unused.  Raises NotImplementedError for in-place
    transforms and ValueError for unsupported type combinations.
    """
    if invec.ptr == outvec.ptr:
        raise NotImplementedError("cupy backend of pycbc.fft does not "
                                  "support in-place transforms")
    # Choose the cupy transform matching the requested type combination.
    if itype == 'complex' and otype == 'complex':
        transform = cupy.fft.fft
    elif itype == 'real' and otype == 'complex':
        transform = cupy.fft.rfft
    else:
        raise ValueError(_INV_FFT_MSG.format("FFT", itype, otype))
    outvec.data[:] = cupy.asarray(transform(invec.data), dtype=outvec.dtype)


def ifft(invec, outvec, _, itype, otype):
    """Perform an inverse FFT from ``invec`` into ``outvec`` via cupy.

    The result is rescaled by len(outvec) to undo cupy's 1/n normalization,
    matching the unnormalized convention of the other pycbc backends.
    Raises NotImplementedError for in-place transforms and ValueError for
    unsupported type combinations.
    """
    if invec.ptr == outvec.ptr:
        raise NotImplementedError("cupy backend of pycbc.fft does not "
                                  "support in-place transforms")
    n = len(outvec)
    if itype == 'complex' and otype == 'complex':
        result = cupy.fft.ifft(invec.data)
    elif itype == 'complex' and otype == 'real':
        result = cupy.fft.irfft(invec.data, n)
    else:
        raise ValueError(_INV_FFT_MSG.format("IFFT", itype, otype))
    outvec.data[:] = cupy.asarray(result, dtype=outvec.dtype)
    outvec *= n


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to have something similar to the numpy warning, i.e "The cupy backend is a prototype, and performance may not be as expected"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't at the same level. The numpy FFT backend is really bad. It's not clear that's the same for cupy. I've not really seen things limited by the memory allocation. One might want a warning in the scheme initialization that things are not great yet, but I think it doesn't belong here.

class FFT(_BaseFFT):
    """Forward FFT executed through the cupy interface."""

    def __init__(self, invec, outvec, nbatch=1, size=None):
        super().__init__(invec, outvec, nbatch, size)
        # Validate the vectors once and remember the transform signature.
        self.prec, self.itype, self.otype = _check_fft_args(invec, outvec)

    def execute(self):
        """Run the forward transform from self.invec into self.outvec."""
        fft(self.invec, self.outvec, self.prec, self.itype, self.otype)


class IFFT(_BaseIFFT):
    """Inverse FFT executed through the cupy interface."""

    def __init__(self, invec, outvec, nbatch=1, size=None):
        super().__init__(invec, outvec, nbatch, size)
        # Validate the vectors once and remember the transform signature.
        self.prec, self.itype, self.otype = _check_fft_args(invec, outvec)

    def execute(self):
        """Run the inverse transform from self.invec into self.outvec."""
        ifft(self.invec, self.outvec, self.prec, self.itype, self.otype)
Loading
Loading