From 531eba485f0163779f879e97e342a610e74bee13 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Sun, 9 Jun 2024 10:08:09 -0400
Subject: [PATCH 01/22] Fixed notation on number of GPUs.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/openclparam.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py
index 28f77f8..fa0206d 100644
--- a/pyebsdindex/opencl/openclparam.py
+++ b/pyebsdindex/opencl/openclparam.py
@@ -88,6 +88,7 @@ def get_gpu(self):
     else:
       pass
     self.gpu = gpu
+    self.ngpu = len(gpu)
     return self.gpu
 
 

From a7162ac4f0f3311b5401b515b6835822b00b4ad3 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 11:12:43 -0400
Subject: [PATCH 02/22] Performance tuning

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index 2be1994..dcdce45 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -309,7 +309,7 @@ def index_pats_distributed(
 
 
     if ngpu > 0:
-        ngpupro = max(12, ngpu*8)  # number of processes that will serve data to the gpu
+        ngpupro = min(max(6, ngpu*6), 12)  # number of processes that will serve data to the gpu
         #ngpupro = 8
         if n_cpu_nodes < 8:
             ngpupro = min(ngpupro,8)

From d742733279c77fa28a13122b4fc0d2bed4f6e805 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 17:51:14 -0400
Subject: [PATCH 03/22] Attempt to fix Apple M-series and NLPAR gpu

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 2 +-
 pyebsdindex/opencl/nlpar_cl.py      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index dcdce45..cf4a64f 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -312,7 +312,7 @@ def index_pats_distributed(
         ngpupro = min(max(6, ngpu*6), 12)  # number of processes that will serve data to the gpu
         #ngpupro = 8
         if n_cpu_nodes < 8:
-            ngpupro = min(ngpupro,8)
+            ngpupro = min(ngpupro, n_cpu_nodes)
         if n_cpu_nodes < 2:
             ngpupro = 2
         #if OSPLATFORM == 'Linux':
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index a3307a2..a367788 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -163,7 +163,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
     #print(gpu_id)
     clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
     clparams.get_queue()
-    target_mem = clparams.queue.device.max_mem_alloc_size//2
+    target_mem = min(clparams.queue.device.max_mem_alloc_size//2, int(4e9))
     ctx = clparams.ctx
     prg = clparams.prg
     queue = clparams.queue
@@ -400,7 +400,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
     #print(gpu_id)
     clparams.get_context(gpu_id=gpu_id, kfile ='clnlpar.cl')
     clparams.get_queue()
-    target_mem = clparams.queue.device.max_mem_alloc_size//2
+    target_mem = min(clparams.queue.device.max_mem_alloc_size//4, int(2e9))
     ctx = clparams.ctx
     prg = clparams.prg
     queue = clparams.queue

From aa5fbf22ecff2e277f3383212f096af3fce9174f Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 21:14:31 -0400
Subject: [PATCH 04/22] More tuning.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index cf4a64f..de2f968 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -309,7 +309,10 @@ def index_pats_distributed(
 
 
     if ngpu > 0:
-        ngpupro = min(max(6, ngpu*6), 12)  # number of processes that will serve data to the gpu
+        gpuratio = (12, ngpu*6)
+        if (platform.machine(), platform.system()) == ('x86_64', 'Darwin'):
+            gpuratio = (6, ngpu*6)
+        ngpupro = min(max(gpuratio), 18)  # number of processes that will serve data to the gpu
         #ngpupro = 8
         if n_cpu_nodes < 8:
             ngpupro = min(ngpupro, n_cpu_nodes)

From 4b949838c2bf7b19f10a28da6e3a81f455561268 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 21:35:28 -0400
Subject: [PATCH 05/22] Check

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index de2f968..c67bc10 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -309,10 +309,10 @@ def index_pats_distributed(
 
 
     if ngpu > 0:
-        gpuratio = (12, ngpu*6)
+        gpuratio = (12, ngpu*4)
         if (platform.machine(), platform.system()) == ('x86_64', 'Darwin'):
             gpuratio = (6, ngpu*6)
-        ngpupro = min(max(gpuratio), 18)  # number of processes that will serve data to the gpu
+        ngpupro = min(max(gpuratio), 12)  # number of processes that will serve data to the gpu
         #ngpupro = 8
         if n_cpu_nodes < 8:
             ngpupro = min(ngpupro, n_cpu_nodes)

From 51433fbb49b19ef5f7829711d42158bc36d24113 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 21:58:45 -0400
Subject: [PATCH 06/22] Attempt to fix CUDA not seeing GPUs in Ray

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index c67bc10..e744539 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -768,6 +768,7 @@ def __optimizegpuchunk__(indexer, ngpupro, gpu_id, clparam):
 @ray.remote(num_cpus=1, num_gpus=1)
 class GPUWorker:
     def __init__(self, actorid=0, clparammodule=None, gpu_id=None, cudavis = '0'):
+        del os.environ['CUDA_VISIBLE_DEVICES']
         # sys.path.append(path.dirname(path.dirname(__file__)))  # do this to help Ray find the program files
         # import openclparam # do this to help Ray find the program files
         # device, context, queue, program, mf

From 6ac0807e9fc2e50748106c58f61688498a1c928a Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 22:09:51 -0400
Subject: [PATCH 07/22] Check IP

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index e744539..1027a87 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -47,7 +47,8 @@
 else:
     from pyebsdindex import band_detect as band_detect
 
-RAYIPADDRESS = '127.0.0.1'
+#RAYIPADDRESS = '127.0.0.1'
+RAYIPADDRESS = '0.0.0.0'
 OSPLATFORM  = platform.system()
 #if OSPLATFORM  == 'Darwin':
 #    RAYIPADDRESS = '0.0.0.0'  # the localhost address does not work on macOS when on a VPN

From a91df1caec48d51be49215425ab1c2cb0e0c172a Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 22:13:11 -0400
Subject: [PATCH 08/22] Revert

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index 1027a87..c9adb29 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -47,8 +47,8 @@
 else:
     from pyebsdindex import band_detect as band_detect
 
-#RAYIPADDRESS = '127.0.0.1'
-RAYIPADDRESS = '0.0.0.0'
+RAYIPADDRESS = '127.0.0.1'
+#RAYIPADDRESS = '0.0.0.0'
 OSPLATFORM  = platform.system()
 #if OSPLATFORM  == 'Darwin':
 #    RAYIPADDRESS = '0.0.0.0'  # the localhost address does not work on macOS when on a VPN

From e7e43186151631ed3ec656a8f198dcdac4c274a9 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Mon, 10 Jun 2024 22:16:15 -0400
Subject: [PATCH 09/22] Revert again.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index c9adb29..e534d4c 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -769,7 +769,7 @@ def __optimizegpuchunk__(indexer, ngpupro, gpu_id, clparam):
 @ray.remote(num_cpus=1, num_gpus=1)
 class GPUWorker:
     def __init__(self, actorid=0, clparammodule=None, gpu_id=None, cudavis = '0'):
-        del os.environ['CUDA_VISIBLE_DEVICES']
+        #del os.environ['CUDA_VISIBLE_DEVICES']
         # sys.path.append(path.dirname(path.dirname(__file__)))  # do this to help Ray find the program files
         # import openclparam # do this to help Ray find the program files
         # device, context, queue, program, mf

From 84ea980382c7bb9d9a7dceebd5662b2192cf767e Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Thu, 20 Jun 2024 08:26:22 -0400
Subject: [PATCH 10/22] Put in initial buffer on GPU worker timeout in case of
 long JIT times.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index e534d4c..53c5924 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -462,7 +462,7 @@ def index_pats_distributed(
 
         #gpu_launched += 1
 
-    gpuwrker_cycles = 0
+    gpuwrker_cycles = -500
     cpuwrker_cycles = 0
 
     while ncpudone < njobs:

From b408b2b24221ef2bcc3dfe6dc8fd75da307cfec5 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Thu, 20 Jun 2024 09:48:07 -0400
Subject: [PATCH 11/22] Replaced use of np.compat.long with np.long64

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py  | 2 +-
 pyebsdindex/band_detect.py           | 2 +-
 pyebsdindex/opencl/band_detect_cl.py | 2 +-
 pyebsdindex/tripletvote.py           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index 53c5924..0f71425 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -373,7 +373,7 @@ def index_pats_distributed(
     # fall back to CPU only calculation.
     clparamfunction = band_detect.getopenclparam
     # Set up the jobs
-    njobs = (np.ceil(npats / chunksize)).astype(np.compat.long)
+    njobs = (np.ceil(npats / chunksize)).astype(np.long64)
 
     p_indx_start_end = [
         [i * chunksize + patstart, (i + 1) * chunksize + patstart, chunksize]
diff --git a/pyebsdindex/band_detect.py b/pyebsdindex/band_detect.py
index af5abb9..1f222fa 100644
--- a/pyebsdindex/band_detect.py
+++ b/pyebsdindex/band_detect.py
@@ -396,7 +396,7 @@ def find_bands(self, patternsIn, verbose=0, chunksize=-1,  **kwargs):
       chunksize = nPats
       chunk_start_end = [[0,nPats]]
     else:
-      nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long)
+      nchunks = (np.ceil(nPats / chunksize)).astype(np.long64)
       chunk_start_end = [[i * chunksize, (i + 1) * chunksize] for i in range(nchunks)]
       chunk_start_end[-1][1] = nPats
 
diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index 77f79a2..9e271cf 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -87,7 +87,7 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU
         nchunks = 1
         chunksize = nPats
       else:
-        nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long)
+        nchunks = (np.ceil(nPats / chunksize)).astype(np.long64)
 
       chunk_start_end = [[i * chunksize,(i + 1) * chunksize] for i in range(nchunks)]
       chunk_start_end[-1][1] = nPats
diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index 67fcbd1..420b17b 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -407,7 +407,7 @@ def build_trip_lib(self):
     #print(indx0FID)
     #This completely over previsions the arrays, this is essentially 
     #N Choose K with N = number of angles and K = 3
-    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.compat.long(math.factorial(3))
+    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.long64(math.factorial(3))
     nlib = nlib.astype(int)
 
     libANG = np.zeros((nlib, 3))

From adb0fe9c413a317a4d78756da32167c1ff1f350d Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Thu, 20 Jun 2024 09:59:49 -0400
Subject: [PATCH 12/22] Remembering that numpy uses int64, not long64.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/_ebsd_index_parallel.py  | 2 +-
 pyebsdindex/band_detect.py           | 2 +-
 pyebsdindex/opencl/band_detect_cl.py | 2 +-
 pyebsdindex/tripletvote.py           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index 0f71425..34e6798 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -373,7 +373,7 @@ def index_pats_distributed(
     # fall back to CPU only calculation.
     clparamfunction = band_detect.getopenclparam
     # Set up the jobs
-    njobs = (np.ceil(npats / chunksize)).astype(np.long64)
+    njobs = (np.ceil(npats / chunksize)).astype(np.int64)
 
     p_indx_start_end = [
         [i * chunksize + patstart, (i + 1) * chunksize + patstart, chunksize]
diff --git a/pyebsdindex/band_detect.py b/pyebsdindex/band_detect.py
index 1f222fa..f1b222a 100644
--- a/pyebsdindex/band_detect.py
+++ b/pyebsdindex/band_detect.py
@@ -396,7 +396,7 @@ def find_bands(self, patternsIn, verbose=0, chunksize=-1,  **kwargs):
       chunksize = nPats
       chunk_start_end = [[0,nPats]]
     else:
-      nchunks = (np.ceil(nPats / chunksize)).astype(np.long64)
+      nchunks = (np.ceil(nPats / chunksize)).astype(np.int64)
       chunk_start_end = [[i * chunksize, (i + 1) * chunksize] for i in range(nchunks)]
       chunk_start_end[-1][1] = nPats
 
diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index 9e271cf..daae977 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -87,7 +87,7 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU
         nchunks = 1
         chunksize = nPats
       else:
-        nchunks = (np.ceil(nPats / chunksize)).astype(np.long64)
+        nchunks = (np.ceil(nPats / chunksize)).astype(np.int64)
 
       chunk_start_end = [[i * chunksize,(i + 1) * chunksize] for i in range(nchunks)]
       chunk_start_end[-1][1] = nPats
diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index 420b17b..4280823 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -407,7 +407,7 @@ def build_trip_lib(self):
     #print(indx0FID)
     #This completely over previsions the arrays, this is essentially 
     #N Choose K with N = number of angles and K = 3
-    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.long64(math.factorial(3))
+    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.int64(math.factorial(3))
     nlib = nlib.astype(int)
 
     libANG = np.zeros((nlib, 3))

From 554e5fae1870700059672eae97463a4305a1b315 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Thu, 20 Jun 2024 11:26:45 -0400
Subject: [PATCH 13/22] Validate that all indices are ints.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/radon_fast.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py
index 7ce1534..77de160 100644
--- a/pyebsdindex/radon_fast.py
+++ b/pyebsdindex/radon_fast.py
@@ -134,9 +134,9 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh
         #else:
           #indx_x = np.ceil(a[i] * n + b1).astype(np.int64)
         indx_x = np.round(a[i] * n + b1).astype(np.int64)
-        indx_x = np.where(indx_x < 0, outofbounds, indx_x)
-        indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x)
-        indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds)
+        indx_x = np.where(indx_x < 0, outofbounds, indx_x).astype(np.int64)
+        indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x).astype(np.int64)
+        indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds).astype(np.int64)
         # for j in range(self.nRho):
         #   indx_good = indx1D[j,:].flatten()
         #   whgood = np.nonzero(indx_good < outofbounds)[0]
@@ -151,10 +151,10 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh
         #       indx1D[j, 0:whmask.size] = newindex[whmask]
 
         self.indexPlan[:, i, 0:self.imDim[0]] = indx1D
-    tempindx = self.indexPlan.flatten()
-    mask = np.concatenate( (self.mask.flatten(), np.array([0,0])))
+    tempindx = self.indexPlan.flatten().astype(np.int64)
+    mask = np.concatenate( (self.mask.flatten().astype(np.int64), np.array([0,0], dtype=np.int64)))
     tempindx = np.where(mask[tempindx] > 0, tempindx, outofbounds)
-    maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1])))
+    maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1]))).astype(np.int64)
     tempindx = np.where(maskindex[tempindx] >= 0, maskindex[tempindx], outofbounds)
     self.indexPlan = tempindx.reshape([self.nRho,self.nTheta,self.imDim.max()])
     self.indexPlan.sort(axis = -1)

From c54f183b7a12f2974741b9e2650c0ffeb5000d9d Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Thu, 20 Jun 2024 15:46:21 -0400
Subject: [PATCH 14/22] More numpy 2.0 cleanup

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/pcopt.py      | 4 ++--
 pyebsdindex/radon_fast.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyebsdindex/pcopt.py b/pyebsdindex/pcopt.py
index 009f7b7..2320e19 100644
--- a/pyebsdindex/pcopt.py
+++ b/pyebsdindex/pcopt.py
@@ -435,9 +435,9 @@ def initializeswarm(self, start=None, bounds=None):
         self.vellimit = 4*np.mean(np.sqrt(np.sum(self.vel**2, axis=1)))
 
 
-        self.pbest = np.zeros(self.n_particles) + np.infty
+        self.pbest = np.zeros(self.n_particles) + np.inf
         self.pbest_loc = np.copy(self.pos)
-        self.gbest = np.infty
+        self.gbest = np.inf
         self.gbest_loc = start
 
 
diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py
index 77de160..300d4fc 100644
--- a/pyebsdindex/radon_fast.py
+++ b/pyebsdindex/radon_fast.py
@@ -331,7 +331,7 @@ def radon2pole(self,bandData,PC=None,vendor='EDAX'):
     stheta = np.sin(theta)
     ctheta = np.cos(theta)
 
-    pctemp =  np.asfarray(PC).copy()
+    pctemp =  np.asarray(PC, dtype=np.float32).copy()
     shapet = pctemp.shape
     if ven != 'EMSOFT':
       if len(shapet) < 2:

From 6bb3bac8a4f315d3f12cc5fed32f0b30617420e3 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 13:54:09 -0400
Subject: [PATCH 15/22] And more numpy 2.0 changes

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/band_detect_cl.py | 2 +-
 pyebsdindex/opencl/radon_fast_cl.py  | 2 +-
 pyebsdindex/tripletvote.py           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index daae977..a2305da 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -270,7 +270,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes)
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon)
     image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image)
-    imstep = np.uint64(np.product(shapeIm[-2:]))
+    imstep = np.uint64(np.prod(shapeIm[-2:]))
     tic = timer()
 
     nImChunk = np.uint64(nImCL/clvtypesize)
diff --git a/pyebsdindex/opencl/radon_fast_cl.py b/pyebsdindex/opencl/radon_fast_cl.py
index 5fc8b75..8089879 100644
--- a/pyebsdindex/opencl/radon_fast_cl.py
+++ b/pyebsdindex/opencl/radon_fast_cl.py
@@ -98,7 +98,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False,
     image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align)
     rdnIndx_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=self.indexPlan)
 
-    imstep = np.uint64(np.product(shapeIm[-2:]))
+    imstep = np.uint64(np.prod(shapeIm[-2:]))
     indxstep = np.uint64(self.indexPlan.shape[-1])
     rdnstep = np.uint64(self.nRho * self.nTheta)
 
diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index 4280823..4910b17 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -792,7 +792,7 @@ def _refine_orientation(self, bandnorms, whGood, polematch):
     tic = timer()
     poles = self.tripLib.completelib['polesCart']
     nGood = whGood.size
-    n2Fit = np.int64(np.product(np.arange(2)+(nGood-2+1))/np.int64(2))
+    n2Fit = np.int64(np.prod(np.arange(2)+(nGood-2+1))/np.int64(2))
     whGood = np.asarray(whGood,dtype=np.int64)
     #AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit)
     # tic = timer()

From 92129f9e6698db18bbe230f810ab14d65ebfb38f Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 14:11:02 -0400
Subject: [PATCH 16/22] Fixed typo

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/clkernels.cl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyebsdindex/opencl/clkernels.cl b/pyebsdindex/opencl/clkernels.cl
index b712d2f..b045f90 100644
--- a/pyebsdindex/opencl/clkernels.cl
+++ b/pyebsdindex/opencl/clkernels.cl
@@ -70,7 +70,7 @@ __kernel void loaduint16( const __global ushort *im1, __global float *im1flt, co
 
 
 // simple program to convert a float to float and transpose array
-__kernel void loaduufloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
+__kernel void loadfloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
   {
   const unsigned long int x = get_global_id(0);
   const unsigned long int y = get_global_id(1);

From ec3921fc8695f6ef33ae680418802ab409ece5fc Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 14:43:18 -0400
Subject: [PATCH 17/22] Type corrections

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/opencl/band_detect_cl.py | 2 +-
 pyebsdindex/opencl/radon_fast_cl.py  | 2 +-
 pyebsdindex/tripletvote.py           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index a2305da..8830f97 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -270,7 +270,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes)
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon)
     image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image)
-    imstep = np.uint64(np.prod(shapeIm[-2:]))
+    imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int))
     tic = timer()
 
     nImChunk = np.uint64(nImCL/clvtypesize)
diff --git a/pyebsdindex/opencl/radon_fast_cl.py b/pyebsdindex/opencl/radon_fast_cl.py
index 8089879..fd8b3b9 100644
--- a/pyebsdindex/opencl/radon_fast_cl.py
+++ b/pyebsdindex/opencl/radon_fast_cl.py
@@ -98,7 +98,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False,
     image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align)
     rdnIndx_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=self.indexPlan)
 
-    imstep = np.uint64(np.prod(shapeIm[-2:]))
+    imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int))
     indxstep = np.uint64(self.indexPlan.shape[-1])
     rdnstep = np.uint64(self.nRho * self.nTheta)
 
diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index 4910b17..26dd026 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -407,7 +407,7 @@ def build_trip_lib(self):
     #print(indx0FID)
     #This completely over previsions the arrays, this is essentially 
     #N Choose K with N = number of angles and K = 3
-    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.int64(math.factorial(3))
+    nlib = int(npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))//np.int64(math.factorial(3)))
     nlib = nlib.astype(int)
 
     libANG = np.zeros((nlib, 3))
@@ -792,7 +792,7 @@ def _refine_orientation(self, bandnorms, whGood, polematch):
     tic = timer()
     poles = self.tripLib.completelib['polesCart']
     nGood = whGood.size
-    n2Fit = np.int64(np.prod(np.arange(2)+(nGood-2+1))/np.int64(2))
+    n2Fit = np.int64(np.prod(np.arange(2)+(nGood-2+1), dtype=int)//np.int64(2))
     whGood = np.asarray(whGood,dtype=np.int64)
     #AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit)
     # tic = timer()

From 065ce7e1de69526ee7b3d7b66ca5d6a8abf450ff Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 14:46:19 -0400
Subject: [PATCH 18/22] Type corrections

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/tripletvote.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index 26dd026..1b39eb4 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -407,7 +407,7 @@ def build_trip_lib(self):
     #print(indx0FID)
     #This completely over previsions the arrays, this is essentially 
     #N Choose K with N = number of angles and K = 3
-    nlib = int(npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))//np.int64(math.factorial(3)))
+    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))//np.int64(math.factorial(3))
     nlib = nlib.astype(int)
 
     libANG = np.zeros((nlib, 3))

From a44b40664a473bbcc39665a30d309e563fd815e1 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 14:52:23 -0400
Subject: [PATCH 19/22] Another type fix

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/ebsd_pattern.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyebsdindex/ebsd_pattern.py b/pyebsdindex/ebsd_pattern.py
index 54b84a5..74e04d1 100644
--- a/pyebsdindex/ebsd_pattern.py
+++ b/pyebsdindex/ebsd_pattern.py
@@ -502,7 +502,7 @@ def pat_reader(self, patStart=0, nPatToRead=1):
     typeread = self.filedatatype
     typebyte = self.filedatatype(0).nbytes
 
-    f.seek(int(nPerPat * patStart * typebyte),1)
+    f.seek(int(np.int64(nPerPat) * np.int64(patStart) * typebyte),1)
     readpats = np.fromfile(f,dtype=typeread,count=int(nPatToRead * nPerPat))
     readpats = readpats.reshape(nPatToRead,self.patternH,self.patternW)
     f.close()

From 25e9cd0338340318e4d038f8f20e28dfeb4fb2e0 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 15:23:47 -0400
Subject: [PATCH 20/22] Signed-off by: David Rowenhorst
 <david.rowenhorst@nrl.navy.mil>

---
 pyebsdindex/ebsd_pattern.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyebsdindex/ebsd_pattern.py b/pyebsdindex/ebsd_pattern.py
index 74e04d1..bbe496c 100644
--- a/pyebsdindex/ebsd_pattern.py
+++ b/pyebsdindex/ebsd_pattern.py
@@ -502,8 +502,13 @@ def pat_reader(self, patStart=0, nPatToRead=1):
     typeread = self.filedatatype
     typebyte = self.filedatatype(0).nbytes
 
+<<<<<<< Updated upstream
     f.seek(int(np.int64(nPerPat) * np.int64(patStart) * typebyte),1)
     readpats = np.fromfile(f,dtype=typeread,count=int(nPatToRead * nPerPat))
+=======
+    f.seek(int(nPerPat * patStart * typebyte),1)
+    readpats = np.fromfile(f,dtype=typeread,count=np.int64(np.int64(nPatToRead) * np.int64(nPerPat)))
+>>>>>>> Stashed changes
     readpats = readpats.reshape(nPatToRead,self.patternH,self.patternW)
     f.close()
     yx = np.unravel_index(np.arange(int(patStart), int(patStart+nPatToRead), dtype = np.uint64),

From eba5f0a39eecc65e06a5dd024c4070a9a3fe4603 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 15:24:58 -0400
Subject: [PATCH 21/22] More corrections

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 pyebsdindex/ebsd_pattern.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/pyebsdindex/ebsd_pattern.py b/pyebsdindex/ebsd_pattern.py
index bbe496c..a8bf20c 100644
--- a/pyebsdindex/ebsd_pattern.py
+++ b/pyebsdindex/ebsd_pattern.py
@@ -502,13 +502,10 @@ def pat_reader(self, patStart=0, nPatToRead=1):
     typeread = self.filedatatype
     typebyte = self.filedatatype(0).nbytes
 
-<<<<<<< Updated upstream
+
     f.seek(int(np.int64(nPerPat) * np.int64(patStart) * typebyte),1)
-    readpats = np.fromfile(f,dtype=typeread,count=int(nPatToRead * nPerPat))
-=======
-    f.seek(int(nPerPat * patStart * typebyte),1)
     readpats = np.fromfile(f,dtype=typeread,count=np.int64(np.int64(nPatToRead) * np.int64(nPerPat)))
->>>>>>> Stashed changes
+
     readpats = readpats.reshape(nPatToRead,self.patternH,self.patternW)
     f.close()
     yx = np.unravel_index(np.arange(int(patStart), int(patStart+nPatToRead), dtype = np.uint64),

From 53ba837da48c986a4a207fe2705cacf259577a91 Mon Sep 17 00:00:00 2001
From: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
Date: Fri, 28 Jun 2024 17:59:48 -0400
Subject: [PATCH 22/22] Prepare for release.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
---
 CHANGELOG.rst                  | 10 ++++++++++
 pyebsdindex/__init__.py        |  2 +-
 pyebsdindex/opencl/nlpar_cl.py |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index a5a0d15..62acd3c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,16 @@ Changelog
 All notable changes to PyEBSDIndex will be documented in this file. The format is based
 on `Keep a Changelog <https://keepachangelog.com/en/1.1.0>`_.
 
+0.3.5 (2024-06-07)
+==================
+
+Fixed
+-----
+- Further tweaking of NLPAR GPU memory limits for Apple-ARM.
+- Many small type fixes for numpy 2.0 compatibility.
+- Corrected GPU detection for distributed indexing.
+- Fixed issue where slower machines would erroneously detect a GPU timeout.
+
 
 0.3.4 (2024-06-07)
 ==================
diff --git a/pyebsdindex/__init__.py b/pyebsdindex/__init__.py
index 53da16a..b8a23bf 100644
--- a/pyebsdindex/__init__.py
+++ b/pyebsdindex/__init__.py
@@ -7,7 +7,7 @@
 ]
 __description__ = "Python based tool for Radon based EBSD indexing"
 __name__ = "pyebsdindex"
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 
 # Try to import only once - also will perform check that at least one GPU is found.
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index a367788..28a239c 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -163,7 +163,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
     #print(gpu_id)
     clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
     clparams.get_queue()
-    target_mem = min(clparams.queue.device.max_mem_alloc_size//2, int(4e9))
+    target_mem = min(clparams.queue.device.max_mem_alloc_size//2, np.int64(4e9))
     ctx = clparams.ctx
     prg = clparams.prg
     queue = clparams.queue