From 748a130a573c424306fe8a6666d477d744aa56eb Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Tue, 28 May 2024 17:52:59 -0400
Subject: [PATCH 1/3] Correct write for windows ... should be faster.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/nlpar_cl.py    | 10 ++++++++--
 pyebsdindex/opencl/nlpar_clray.py |  9 +++++----
 pyebsdindex/opencl/openclparam.py |  2 +-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index d0d5211..42a7671 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -264,6 +264,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
 
 
         sigmachunk_gpu =  cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes)
+
         cl.enqueue_barrier(queue)
         prg.calcsigma(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
                                datapad_gpu, mask_gpu,sigmachunk_gpu,
@@ -404,7 +405,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
     clvectlen = 16
 
 
-
+    # print("target mem:", target_mem)
     chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem,
                               col_overlap=sr, row_overlap=sr)
     #print(chunks[2], chunks[3])
@@ -426,10 +427,14 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
     nchunks = chunksize.size
     #return chunks, chunksize
     mxchunk = int(chunksize.max())
+    # print("max chunk:" , mxchunk)
+
     npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/ clvectlen))
 
     datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
     datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
+    # print("data pad", datapad_gpu.size)
+    # print("data out", datapadout_gpu.size)
 
     nnn = int((2 * sr + 1) ** 2)
 
@@ -469,6 +474,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
 
         sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32))
         sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk)
+        # print("sigma", sigmachunk_gpu.size)
         szdata = data.size
         npad = clvectlen * int(np.ceil(szdata / clvectlen))
 
@@ -476,7 +482,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
         #datapad[0:szdata] = data.reshape(-1)
 
         data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data)
-
+        # print("data", data_gpu.size)
         if data.dtype.type is np.float32:
           prg.nlloadpat32flt(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
         if data.dtype.type is np.ubyte:
diff --git a/pyebsdindex/opencl/nlpar_clray.py b/pyebsdindex/opencl/nlpar_clray.py
index 87a1fc8..98f003a 100644
--- a/pyebsdindex/opencl/nlpar_clray.py
+++ b/pyebsdindex/opencl/nlpar_clray.py
@@ -110,7 +110,7 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz
       cudavis += str(cdgpu) + ','
 
     #print(gpu_id)
-    ngpuwrker = 6
+    ngpuwrker = 4
     clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
     clparams.get_queue()
     if clparams.gpu[gpu_id].host_unified_memory:
@@ -119,9 +119,10 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz
                                          normalize_d=normalize_d,
                                          gpu_id=gpu_id, **kwargs)
 
-    target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 3
+    target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 2
     max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75
     if target_mem * ngpuwrker > max_mem:
+      print('revisemem:')
       target_mem = max_mem / ngpuwrker
 
     patternfile = self.getinfileobj()
@@ -445,7 +446,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
       else:  # not int, so no rescale.
         self.rescale = False
 
-    ngpuwrker = 6
+    ngpuwrker = 4
     clparams = openclparam.OpenClParam()
     clparams.get_gpu()
     if gpu_id is None:
@@ -545,7 +546,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
         if len(jobqueue) > 0:
             if len(idlewrker) > 0:
                 wrker = idlewrker.pop()
-                job = jobqueue.pop()
+                job = jobqueue.pop(0)
 
                 tasks.append(wrker.runnlpar_chunk.remote(job, nlparobj=nlpar_remote))
                 busywrker.append(wrker)
diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py
index 5552842..28f77f8 100644
--- a/pyebsdindex/opencl/openclparam.py
+++ b/pyebsdindex/opencl/openclparam.py
@@ -25,7 +25,7 @@
 from os import path
 import pyopencl as cl
 from os import environ
-environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
+environ['PYOPENCL_COMPILER_OUTPUT'] = '0'
 
 RADDEG = 180.0/np.pi
 DEGRAD = np.pi/180.0

From 5a3e9c69de2e31e6431bc99499dcf21951dca543 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Wed, 29 May 2024 17:22:10 -0400
Subject: [PATCH 2/3] Better memory management.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/nlpar_clray.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/pyebsdindex/opencl/nlpar_clray.py b/pyebsdindex/opencl/nlpar_clray.py
index 98f003a..564129e 100644
--- a/pyebsdindex/opencl/nlpar_clray.py
+++ b/pyebsdindex/opencl/nlpar_clray.py
@@ -110,7 +110,7 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz
       cudavis += str(cdgpu) + ','
 
     #print(gpu_id)
-    ngpuwrker = 4
+    ngpuwrker = 6
     clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
     clparams.get_queue()
     if clparams.gpu[gpu_id].host_unified_memory:
@@ -120,9 +120,9 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz
                                          gpu_id=gpu_id, **kwargs)
 
     target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 2
-    max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75
+    max_mem = clparams.gpu[gpu_id].global_mem_size * 0.5
     if target_mem * ngpuwrker > max_mem:
-      print('revisemem:')
+      #print('revisemem:')
       target_mem = max_mem / ngpuwrker
 
     patternfile = self.getinfileobj()
@@ -446,7 +446,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
       else:  # not int, so no rescale.
         self.rescale = False
 
-    ngpuwrker = 4
+    ngpuwrker = 6
     clparams = openclparam.OpenClParam()
     clparams.get_gpu()
     if gpu_id is None:
@@ -480,7 +480,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
                                          gpu_id= gpu_id)
 
     target_mem = clparams.gpu[gpu_id].max_mem_alloc_size//3
-    max_mem = clparams.gpu[gpu_id].global_mem_size*0.75
+    max_mem = clparams.gpu[gpu_id].global_mem_size*0.4
     if target_mem*ngpuwrker > max_mem:
       target_mem = max_mem/ngpuwrker
     #print(target_mem/1.0e9)
@@ -562,6 +562,10 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
                   ndone += 1
                   if verbose >= 2:
                     print("tiles complete: ", ndone, "/", njobs, sep='', end='\r')
+                else: # An error has occurred ... hopefully the job just needs to be re-processed.
+                  jobqueue.append(job)
+                  print(message)
+
     if verbose >= 2:
       print('\n', end='')
     return str(self.patternfileout.filepath)

From ce0bc636539faa7ebb0e62849c9e143977923f50 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Thu, 30 May 2024 17:22:23 -0400
Subject: [PATCH 3/3] Correct issue when non-numpy array patterns (like Dask
 arrays) are sent to be indexed.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/band_detect_cl.py | 2 ++
 pyebsdindex/radon_fast.py            | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index f849224..77f79a2 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -225,6 +225,8 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU
   def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, background = None, returnBuff = True, clparams=None ):
     # this function executes the radon sumations on the GPU
     tic = timer()
+    image = np.asarray(image)
+
     # make sure we have an OpenCL environment
     if clparams is not None:
       if clparams.queue is None:
diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py
index 60e19ec..7ce1534 100644
--- a/pyebsdindex/radon_fast.py
+++ b/pyebsdindex/radon_fast.py
@@ -235,6 +235,7 @@ def radon_fast(self, imageIn, padding = np.array([0,0]), fixArtifacts = False,
 
   def radon_faster(self,imageIn,padding = np.array([0,0]), fixArtifacts = False, background = None, normalization=True):
     tic = timer()
+
     shapeIm = np.shape(imageIn)
     if imageIn.ndim == 2:
       nIm = 1
@@ -244,11 +245,13 @@ def radon_faster(self,imageIn,padding = np.array([0,0]), fixArtifacts = False, b
       nIm = shapeIm[0]
     #  reform = False
 
+
     if background is None:
       image = (imageIn.reshape(-1)).astype(np.float32)
     else:
       image = imageIn - background
       image = (image.reshape(-1)).astype(np.float32)
+    image = np.asarray(image)
 
     nPx = shapeIm[-1]*shapeIm[-2]
     indxDim = np.asarray(self.indexPlan.shape)