-
Notifications
You must be signed in to change notification settings - Fork 359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prototype cupy backend #4952
Prototype cupy backend #4952
Changes from 26 commits
d2a79e6
f72d346
e26afa9
d29743a
670ad34
a745c8d
e31dd1b
5e321f3
dd7bc60
f0e43f2
cf0d2ce
8a76a6d
059dc9e
1e402ba
8f5b539
d404116
a273450
27090ba
7c463fa
c49ccea
2d7a514
e9e8c66
89843ea
ea040b2
56bf071
5de37e4
c21f57a
7850921
62a59e5
1c49374
34fb40e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,259 @@ | ||
# Copyright (C) 2012 Alex Nitz | ||
# This program is free software; you can redistribute it and/or modify it | ||
# under the terms of the GNU General Public License as published by the | ||
# Free Software Foundation; either version 3 of the License, or (at your | ||
# option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but | ||
# WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General | ||
# Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License along | ||
# with this program; if not, write to the Free Software Foundation, Inc., | ||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
|
||
|
||
# | ||
# ============================================================================= | ||
# | ||
# Preamble | ||
# | ||
# ============================================================================= | ||
# | ||
|
||
import cupy as cp | ||
import functools | ||
import mako.template | ||
from .eventmgr import _BaseThresholdCluster | ||
import pycbc.scheme | ||
|
||
val = None | ||
loc = None | ||
|
||
# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy | ||
|
||
tkernel1 = mako.template.Template(""" | ||
extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){ | ||
int s = window * blockIdx.x; | ||
int e = s + window; | ||
|
||
// shared memory for chuck size candidates | ||
__shared__ float svr[${chunk}]; | ||
__shared__ float svi[${chunk}]; | ||
__shared__ int sl[${chunk}]; | ||
|
||
// shared memory for the warp size candidates | ||
__shared__ float svv[32]; | ||
__shared__ int idx[32]; | ||
|
||
int ml = -1; | ||
float mvr = 0; | ||
float mvi = 0; | ||
float re; | ||
float im; | ||
|
||
// Iterate trought the entire window size chunk and find blockDim.x number | ||
// of candidates | ||
for (int i = s + threadIdx.x; i < e; i += blockDim.x){ | ||
re = in[i].x; | ||
im = in[i].y; | ||
if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){ | ||
mvr = re; | ||
mvi = im; | ||
ml = i; | ||
} | ||
} | ||
|
||
// Save the candidate from this thread to shared memory | ||
svr[threadIdx.x] = mvr; | ||
svi[threadIdx.x] = mvi; | ||
sl[threadIdx.x] = ml; | ||
|
||
__syncthreads(); | ||
|
||
if (threadIdx.x < 32){ | ||
int tl = threadIdx.x; | ||
|
||
// Now that we have all the candiates for this chunk in shared memory | ||
// Iterate through in the warp size to reduce to 32 candidates | ||
for (int i = threadIdx.x; i < ${chunk}; i += 32){ | ||
re = svr[i]; | ||
im = svi[i]; | ||
if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){ | ||
tl = i; | ||
mvr = re; | ||
mvi = im; | ||
} | ||
} | ||
|
||
// Store the 32 candidates into shared memory | ||
svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl]; | ||
idx[threadIdx.x] = tl; | ||
|
||
// Find the 1 candidate we are looking for using a manual log algorithm | ||
if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 16]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 16]; | ||
} | ||
|
||
if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 8]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 8]; | ||
} | ||
|
||
if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 4]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 4]; | ||
} | ||
|
||
if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 2]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 2]; | ||
} | ||
|
||
|
||
// Save the 1 candidate maximum and location to the output vectors | ||
if (threadIdx.x == 0){ | ||
if (svv[threadIdx.x] < svv[threadIdx.x + 1]){ | ||
idx[0] = idx[1]; | ||
svv[0] = svv[1]; | ||
} | ||
|
||
if (svv[0] > threshold){ | ||
tl = idx[0]; | ||
outv[blockIdx.x].x = svr[tl]; | ||
outv[blockIdx.x].y = svi[tl]; | ||
outl[blockIdx.x] = sl[tl]; | ||
} else{ | ||
outl[blockIdx.x] = -1; | ||
} | ||
} | ||
} | ||
} | ||
""") | ||
|
||
tkernel2 = mako.template.Template(""" | ||
extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){ | ||
__shared__ int loc[${blocks}]; | ||
__shared__ float val[${blocks}]; | ||
|
||
int i = threadIdx.x; | ||
|
||
int l = outl[i]; | ||
loc[i] = l; | ||
|
||
if (l == -1) | ||
return; | ||
|
||
val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y; | ||
|
||
|
||
// Check right | ||
if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){ | ||
outl[i] = -1; | ||
return; | ||
} | ||
|
||
// Check left | ||
if ( (i > 0) && (val[i - 1] > val[i]) ){ | ||
outl[i] = -1; | ||
return; | ||
} | ||
} | ||
""") | ||
|
||
@functools.lru_cache(maxsize=None) | ||
def get_tkernel(slen, window): | ||
if window < 32: | ||
raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples") | ||
|
||
elif window <= 4096: | ||
nt = 128 | ||
elif window <= 16384: | ||
nt = 256 | ||
elif window <= 32768: | ||
nt = 512 | ||
else: | ||
nt = 1024 | ||
|
||
nb = int(cp.ceil(slen / float(window))) | ||
|
||
if nb > 1024: | ||
raise ValueError("More than 1024 blocks not supported yet") | ||
|
||
fn = cp.RawKernel( | ||
tkernel1.render(chunk=nt), | ||
'threshold_and_cluster', | ||
backend='nvcc' | ||
) | ||
fn2 = cp.RawKernel( | ||
tkernel2.render(blocks=nb), | ||
'threshold_and_cluster2', | ||
backend='nvcc' | ||
) | ||
return (fn, fn2), nt, nb | ||
|
||
def threshold_and_cluster(series, threshold, window): | ||
global val | ||
global loc | ||
if val is None: | ||
val = cp.zeros(4096*256, dtype=cp.complex64) | ||
if loc is None: | ||
loc = cp.zeros(4096*256, cp.int32) | ||
|
||
outl = loc | ||
outv = val | ||
slen = len(series) | ||
series = series.data | ||
(fn, fn2), nt, nb = get_tkernel(slen, window) | ||
threshold = cp.float32(threshold * threshold) | ||
window = cp.int32(window) | ||
|
||
cl = loc[0:nb] | ||
cv = val[0:nb] | ||
|
||
fn((nb,), (nt,), (series.data, outv, outl, window, threshold)) | ||
fn2((1,), (nb,), (outv, outl, threshold, window)) | ||
w = (cl != -1) | ||
return cv[w], cl[w] | ||
|
||
class CUDAThresholdCluster(_BaseThresholdCluster): | ||
def __init__(self, series): | ||
self.series = series | ||
|
||
global val | ||
global loc | ||
if val is None: | ||
val = cp.zeros(4096*256, dtype=cp.complex64) | ||
if loc is None: | ||
loc = cp.zeros(4096*256, cp.int32) | ||
|
||
self.outl = loc | ||
self.outv = val | ||
self.slen = len(series) | ||
|
||
def threshold_and_cluster(self, threshold, window): | ||
threshold = cp.float32(threshold * threshold) | ||
window = cp.int32(window) | ||
|
||
(fn, fn2), nt, nb = get_tkernel(self.slen, window) | ||
cl = loc[0:nb] | ||
cv = val[0:nb] | ||
|
||
fn( | ||
(nt, 1, 1), | ||
(nb, 1), | ||
(self.series.data, self.outv, self.outl, window, threshold) | ||
) | ||
fn2( | ||
(nb, 1, 1), | ||
(1, 1), | ||
(self.outv, self.outl, threshold, window) | ||
) | ||
w = (cl != -1) | ||
return cp.asnumpy(cv[w]), cp.asnumpy(cl[w]) | ||
|
||
def _threshold_cluster_factory(series): | ||
return CUDAThresholdCluster | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright (C) 2024 Y Ddraig Goch | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 2 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with with program; see the file COPYING. If not, write to the | ||
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, | ||
# MA 02111-1307 USA | ||
|
||
from .core import _list_available | ||
|
||
_backend_dict = {'cupy' : 'cupyfft'} | ||
_backend_list = ['cupy'] | ||
|
||
_alist, _adict = _list_available(_backend_list, _backend_dict) | ||
|
||
cupy_backend = None | ||
|
||
def set_backend(backend_list): | ||
global cupy_backend | ||
for backend in backend_list: | ||
if backend in _alist: | ||
cupy_backend = backend | ||
break | ||
|
||
def get_backend(): | ||
return _adict[cupy_backend] | ||
|
||
set_backend(_backend_list) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# Copyright (C) 2024 Y Ddraig Goch | ||
# | ||
# This program is free software; you can redistribute it and/or modify it | ||
# under the terms of the GNU General Public License as published by the | ||
# Free Software Foundation; either version 3 of the License, or (at your | ||
# option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but | ||
# WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General | ||
# Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License along | ||
# with this program; if not, write to the Free Software Foundation, Inc., | ||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
|
||
# | ||
# ============================================================================= | ||
# | ||
# Preamble | ||
# | ||
# ============================================================================= | ||
# | ||
""" | ||
This module provides the cupy backend of the fast Fourier transform | ||
for the PyCBC package. | ||
""" | ||
|
||
import logging | ||
import cupy.fft | ||
from .core import _check_fft_args | ||
from .core import _BaseFFT, _BaseIFFT | ||
|
||
_INV_FFT_MSG = ("I cannot perform an {} between data with an input type of " | ||
"{} and an output type of {}") | ||
|
||
def fft(invec, outvec, _, itype, otype): | ||
if invec.ptr == outvec.ptr: | ||
raise NotImplementedError("cupy backend of pycbc.fft does not " | ||
"support in-place transforms") | ||
if itype == 'complex' and otype == 'complex': | ||
outvec.data[:] = cupy.asarray(cupy.fft.fft(invec.data), | ||
dtype=outvec.dtype) | ||
elif itype == 'real' and otype == 'complex': | ||
outvec.data[:] = cupy.asarray(cupy.fft.rfft(invec.data), | ||
dtype=outvec.dtype) | ||
else: | ||
raise ValueError(_INV_FFT_MSG.format("FFT", itype, otype)) | ||
|
||
|
||
def ifft(invec, outvec, _, itype, otype): | ||
if invec.ptr == outvec.ptr: | ||
raise NotImplementedError("cupy backend of pycbc.fft does not " | ||
"support in-place transforms") | ||
if itype == 'complex' and otype == 'complex': | ||
outvec.data[:] = cupy.asarray(cupy.fft.ifft(invec.data), | ||
dtype=outvec.dtype) | ||
outvec *= len(outvec) | ||
elif itype == 'complex' and otype == 'real': | ||
outvec.data[:] = cupy.asarray(cupy.fft.irfft(invec.data,len(outvec)), | ||
dtype=outvec.dtype) | ||
outvec *= len(outvec) | ||
else: | ||
raise ValueError(_INV_FFT_MSG.format("IFFT", itype, otype)) | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be good to have something similar to the numpy warning, i.e "The cupy backend is a prototype, and performance may not be as expected" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't at the same level. The numpy FFT backend is really bad. It's not clear that's the same for cupy. I've not really seen things limited by the memory allocation. One might want a warning in the scheme initialization that things are not great yet, but I think it doesn't belong here. |
||
class FFT(_BaseFFT): | ||
""" | ||
Class for performing FFTs via the cupy interface. | ||
""" | ||
def __init__(self, invec, outvec, nbatch=1, size=None): | ||
super(FFT, self).__init__(invec, outvec, nbatch, size) | ||
self.prec, self.itype, self.otype = _check_fft_args(invec, outvec) | ||
|
||
def execute(self): | ||
fft(self.invec, self.outvec, self.prec, self.itype, self.otype) | ||
|
||
|
||
class IFFT(_BaseIFFT): | ||
""" | ||
Class for performing IFFTs via the cupy interface. | ||
""" | ||
def __init__(self, invec, outvec, nbatch=1, size=None): | ||
super(IFFT, self).__init__(invec, outvec, nbatch, size) | ||
self.prec, self.itype, self.otype = _check_fft_args(invec, outvec) | ||
|
||
def execute(self): | ||
ifft(self.invec, self.outvec, self.prec, self.itype, self.otype) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
backend_cuda
version of this has theif pycbc.HAVE_CUDA
statement and this doesn't. This makes me think, should this backend work when not on a GPU?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that's used to stop the tests failing here when no GPU is present by not loading any CUDA module ... I'll probably need this, but it might be possible to have the documentation and help text run for the cupy backend, even if the code isn't going to work.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Clearly the tests do need to pass before merging).