Hack around the local memory problem.

diku-dk · Sep 18, 2020 · 1fd5c0f · 1fd5c0f
1 parent f6a7f10
commit 1fd5c0f
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -49,6 +49,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
   * Missing synchronisation for in-place updates at group level.
 
+  * Fixed (in a hacky way) an issue where `reduce_by_index` would use
+    too much local memory on AMD GPUs when using the OpenCL backend.
+
 ## [0.16.4]
 
 ### Added

diff --git a/rts/c/opencl.h b/rts/c/opencl.h
@@ -556,11 +556,18 @@ static cl_program setup_opencl_with_command_queue(struct opencl_context *ctx,
   // Futhark reserves 4 bytes for bookkeeping information.
   max_local_memory -= 4;
 
-  // NVIDIA reserves some more bytes for who-knows-what.  The number
-  // of bytes here has been experimentally determined, but the
-  // overhead seems to vary a bit depending on what the kernel does.
+  // The OpenCL implementation may reserve some local memory bytes for
+  // various purposes.  In principle, we should use
+  // clGetKernelWorkGroupInfo() to figure out for each kernel how much
+  // is actually available, but our current code generator design
+  // makes this infeasible.  Instead, we have this nasty hack where we
+  // subtract some bytes per platform, based on empirical measurements
+  // (which may still be wrong, as the overhead can vary by kernel).
+  // Fortunately, we rarely push local memory usage to its limit.
   if (strstr(device_option.platform_name, "NVIDIA CUDA") != NULL) {
     max_local_memory -= 12;
+  } else if (strstr(device_option.platform_name, "AMD") != NULL) {
+    max_local_memory -= 16;
   }
 
   // Make sure this function is defined.

diff --git a/rts/python/opencl.py b/rts/python/opencl.py
@@ -115,6 +115,8 @@ def initialise_opencl_object(self,
     # See comment in rts/c/opencl.h.
     if self.platform.name.find('NVIDIA CUDA') >= 0:
         self.max_local_memory -= 12
+    elif self.platform.name.find('AMD') >= 0:
+        self.max_local_memory -= 16
 
     self.free_list = {}