From 748a130a573c424306fe8a6666d477d744aa56eb Mon Sep 17 00:00:00 2001 From: David Rowenhorst Date: Tue, 28 May 2024 17:52:59 -0400 Subject: [PATCH 1/3] Correct write for windows ... should be faster. Signed-off-by: David Rowenhorst --- pyebsdindex/opencl/nlpar_cl.py | 10 ++++++++-- pyebsdindex/opencl/nlpar_clray.py | 9 +++++---- pyebsdindex/opencl/openclparam.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py index d0d5211..42a7671 100644 --- a/pyebsdindex/opencl/nlpar_cl.py +++ b/pyebsdindex/opencl/nlpar_cl.py @@ -264,6 +264,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa sigmachunk_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes) + cl.enqueue_barrier(queue) prg.calcsigma(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None, datapad_gpu, mask_gpu,sigmachunk_gpu, @@ -404,7 +405,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation clvectlen = 16 - + # print("target mem:", target_mem) chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem, col_overlap=sr, row_overlap=sr) #print(chunks[2], chunks[3]) @@ -426,10 +427,14 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation nchunks = chunksize.size #return chunks, chunksize mxchunk = int(chunksize.max()) + # print("max chunk:" , mxchunk) + npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/ clvectlen)) datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4)) + # print("data pad", datapad_gpu.size) + # print("data out", datapadout_gpu.size) nnn = int((2 * sr + 1) ** 2) @@ -469,6 +474,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32)) sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | 
mf.COPY_HOST_PTR, hostbuf=sigmachunk) + # print("sigma", sigmachunk_gpu.size) szdata = data.size npad = clvectlen * int(np.ceil(szdata / clvectlen)) @@ -476,7 +482,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation #datapad[0:szdata] = data.reshape(-1) data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data) - + # print("data", data_gpu.size) if data.dtype.type is np.float32: prg.nlloadpat32flt(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain]) if data.dtype.type is np.ubyte: diff --git a/pyebsdindex/opencl/nlpar_clray.py b/pyebsdindex/opencl/nlpar_clray.py index 87a1fc8..98f003a 100644 --- a/pyebsdindex/opencl/nlpar_clray.py +++ b/pyebsdindex/opencl/nlpar_clray.py @@ -110,7 +110,7 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz cudavis += str(cdgpu) + ',' #print(gpu_id) - ngpuwrker = 6 + ngpuwrker = 4 clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') clparams.get_queue() if clparams.gpu[gpu_id].host_unified_memory: @@ -119,9 +119,10 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz normalize_d=normalize_d, gpu_id=gpu_id, **kwargs) - target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 3 + target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 2 max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75 if target_mem * ngpuwrker > max_mem: + print('revisemem:') target_mem = max_mem / ngpuwrker patternfile = self.getinfileobj() @@ -445,7 +446,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat else: # not int, so no rescale. 
self.rescale = False - ngpuwrker = 6 + ngpuwrker = 4 clparams = openclparam.OpenClParam() clparams.get_gpu() if gpu_id is None: @@ -545,7 +546,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat if len(jobqueue) > 0: if len(idlewrker) > 0: wrker = idlewrker.pop() - job = jobqueue.pop() + job = jobqueue.pop(0) tasks.append(wrker.runnlpar_chunk.remote(job, nlparobj=nlpar_remote)) busywrker.append(wrker) diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py index 5552842..28f77f8 100644 --- a/pyebsdindex/opencl/openclparam.py +++ b/pyebsdindex/opencl/openclparam.py @@ -25,7 +25,7 @@ from os import path import pyopencl as cl from os import environ -environ['PYOPENCL_COMPILER_OUTPUT'] = '1' +environ['PYOPENCL_COMPILER_OUTPUT'] = '0' RADDEG = 180.0/np.pi DEGRAD = np.pi/180.0 From 5a3e9c69de2e31e6431bc99499dcf21951dca543 Mon Sep 17 00:00:00 2001 From: David Rowenhorst Date: Wed, 29 May 2024 17:22:10 -0400 Subject: [PATCH 2/3] Better memory management. 
Signed-off-by: David Rowenhorst --- pyebsdindex/opencl/nlpar_clray.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pyebsdindex/opencl/nlpar_clray.py b/pyebsdindex/opencl/nlpar_clray.py index 98f003a..564129e 100644 --- a/pyebsdindex/opencl/nlpar_clray.py +++ b/pyebsdindex/opencl/nlpar_clray.py @@ -110,7 +110,7 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz cudavis += str(cdgpu) + ',' #print(gpu_id) - ngpuwrker = 4 + ngpuwrker = 6 clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') clparams.get_queue() if clparams.gpu[gpu_id].host_unified_memory: @@ -120,9 +120,9 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz gpu_id=gpu_id, **kwargs) target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 2 - max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75 + max_mem = clparams.gpu[gpu_id].global_mem_size * 0.5 if target_mem * ngpuwrker > max_mem: - print('revisemem:') + #print('revisemem:') target_mem = max_mem / ngpuwrker patternfile = self.getinfileobj() @@ -446,7 +446,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat else: # not int, so no rescale. self.rescale = False - ngpuwrker = 4 + ngpuwrker = 6 clparams = openclparam.OpenClParam() clparams.get_gpu() if gpu_id is None: @@ -480,7 +480,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat gpu_id= gpu_id) target_mem = clparams.gpu[gpu_id].max_mem_alloc_size//3 - max_mem = clparams.gpu[gpu_id].global_mem_size*0.75 + max_mem = clparams.gpu[gpu_id].global_mem_size*0.4 if target_mem*ngpuwrker > max_mem: target_mem = max_mem/ngpuwrker #print(target_mem/1.0e9) @@ -562,6 +562,10 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat ndone += 1 if verbose >= 2: print("tiles complete: ", ndone, "/", njobs, sep='', end='\r') + else: #An error has occurred ... hopefully just need a re-process. 
+ jobqueue.append(job) + print(message) + if verbose >= 2: print('\n', end='') return str(self.patternfileout.filepath) From ce0bc636539faa7ebb0e62849c9e143977923f50 Mon Sep 17 00:00:00 2001 From: David Rowenhorst Date: Thu, 30 May 2024 17:22:23 -0400 Subject: [PATCH 3/3] Correct issue when non-numpy array patterns (like Dask arrays) are sent to be indexed. Signed-off-by: David Rowenhorst --- pyebsdindex/opencl/band_detect_cl.py | 2 ++ pyebsdindex/radon_fast.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py index f849224..77f79a2 100644 --- a/pyebsdindex/opencl/band_detect_cl.py +++ b/pyebsdindex/opencl/band_detect_cl.py @@ -225,6 +225,8 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, background = None, returnBuff = True, clparams=None ): # this function executes the radon sumations on the GPU tic = timer() + image = np.asarray(image) + # make sure we have an OpenCL environment if clparams is not None: if clparams.queue is None: diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py index 60e19ec..7ce1534 100644 --- a/pyebsdindex/radon_fast.py +++ b/pyebsdindex/radon_fast.py @@ -235,6 +235,7 @@ def radon_fast(self, imageIn, padding = np.array([0,0]), fixArtifacts = False, def radon_faster(self,imageIn,padding = np.array([0,0]), fixArtifacts = False, background = None, normalization=True): tic = timer() + shapeIm = np.shape(imageIn) if imageIn.ndim == 2: nIm = 1 @@ -244,11 +245,13 @@ def radon_faster(self,imageIn,padding = np.array([0,0]), fixArtifacts = False, b nIm = shapeIm[0] # reform = False + if background is None: image = (imageIn.reshape(-1)).astype(np.float32) else: image = imageIn - background image = (image.reshape(-1)).astype(np.float32) + image = np.asarray(image) nPx = shapeIm[-1]*shapeIm[-2] indxDim = np.asarray(self.indexPlan.shape)