Permit querying of cache size and number of registers.
Use this in SegHist codegen.
athas committed Dec 6, 2023
1 parent 2abaf42 commit 6f15eba
Showing 5 changed files with 48 additions and 9 deletions.
9 changes: 9 additions & 0 deletions rts/c/backends/cuda.h
@@ -289,6 +289,7 @@ struct futhark_context {
  size_t max_threshold;
  size_t max_local_memory;
  size_t max_bespoke;
  size_t max_cache;

  size_t lockstep_width;

@@ -457,6 +458,12 @@ static void cuda_nvrtc_mk_build_options(struct futhark_context *ctx, const char
  opts[i++] = msgprintf("-D%s=%d",
                        "max_group_size",
                        (int)ctx->max_group_size);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_local_memory",
                        (int)ctx->max_local_memory);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_registers",
                        (int)ctx->max_registers);
  for (int j = 0; j < cfg->num_tuning_params; j++) {
    opts[i++] = msgprintf("-D%s=%zu", cfg->tuning_param_vars[j],
                          cfg->tuning_params[j]);
@@ -804,6 +811,8 @@ int backend_context_setup(struct futhark_context* ctx) {
  ctx->max_tile_size = sqrt(ctx->max_group_size);
  ctx->max_threshold = 0;
  ctx->max_bespoke = 0;
  ctx->max_registers = device_query(ctx->dev, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
  ctx->max_cache = device_query(ctx->dev, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE);
  ctx->lockstep_width = device_query(ctx->dev, WARP_SIZE);
  CUDA_SUCCEED_FATAL(cuStreamCreate(&ctx->stream, CU_STREAM_DEFAULT));
  cuda_size_setup(ctx);
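For reference, the two CUDA attributes used in backend_context_setup above can also be queried directly through the driver API, which the RTS helper device_query presumably wraps. A minimal standalone sketch, not part of this commit, with error handling elided:

#include <cuda.h>
#include <stdio.h>

int main(void) {
  CUdevice dev;
  int l2_bytes = 0, regs_per_block = 0;
  cuInit(0);
  cuDeviceGet(&dev, 0);
  // L2 cache size in bytes; stored as ctx->max_cache above.
  cuDeviceGetAttribute(&l2_bytes, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
  // Maximum 32-bit registers usable by a single block; stored as ctx->max_registers.
  cuDeviceGetAttribute(&regs_per_block, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  printf("L2 cache: %d bytes, registers per block: %d\n", l2_bytes, regs_per_block);
  return 0;
}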
10 changes: 10 additions & 0 deletions rts/c/backends/hip.h
@@ -263,6 +263,8 @@ struct futhark_context {
  size_t max_threshold;
  size_t max_local_memory;
  size_t max_bespoke;
  size_t max_registers;
  size_t max_cache;

  size_t lockstep_width;

@@ -473,6 +475,12 @@ static void hiprtc_mk_build_options(struct futhark_context *ctx, const char *ext
  opts[i++] = msgprintf("-D%s=%d",
                        "max_group_size",
                        (int)ctx->max_group_size);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_local_memory",
                        (int)ctx->max_local_memory);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_registers",
                        (int)ctx->max_registers);
  for (int j = 0; j < cfg->num_tuning_params; j++) {
    opts[i++] = msgprintf("-D%s=%zu", cfg->tuning_param_vars[j],
                          cfg->tuning_params[j]);
@@ -659,6 +667,8 @@ int backend_context_setup(struct futhark_context* ctx) {
  ctx->max_tile_size = sqrt(ctx->max_group_size);
  ctx->max_threshold = 0;
  ctx->max_bespoke = 0;
  ctx->max_registers = device_query(ctx->dev, hipDeviceAttributeMaxRegistersPerBlock);
  ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
  // FIXME: in principle we should query hipDeviceAttributeWarpSize
  // from the device, which will provide 64 on AMD GPUs.
  // Unfortunately, we currently do nasty implicit intra-warp
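The HIP runtime exposes the same two attributes through hipDeviceGetAttribute; a corresponding standalone sketch, not part of this commit, with error handling elided:

#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
  int l2_bytes = 0, regs_per_block = 0;
  // L2 cache size in bytes; stored as ctx->max_cache above.
  hipDeviceGetAttribute(&l2_bytes, hipDeviceAttributeL2CacheSize, 0);
  // Maximum registers usable by a single workgroup; stored as ctx->max_registers.
  hipDeviceGetAttribute(&regs_per_block, hipDeviceAttributeMaxRegistersPerBlock, 0);
  printf("L2 cache: %d bytes, registers per block: %d\n", l2_bytes, regs_per_block);
  return 0;
}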
20 changes: 20 additions & 0 deletions rts/c/backends/opencl.h
@@ -528,6 +528,8 @@ struct futhark_context {
  size_t max_tile_size;
  size_t max_threshold;
  size_t max_local_memory;
  size_t max_registers;
  size_t max_cache;

  size_t lockstep_width;

@@ -594,6 +596,16 @@ static char* mk_compile_opts(struct futhark_context *ctx,
"max_group_size",
(int)ctx->max_group_size);

w += snprintf(compile_opts+w, compile_opts_size-w,
"-D%s=%d ",
"max_local_memory",
(int)ctx->max_local_memory);

w += snprintf(compile_opts+w, compile_opts_size-w,
"-D%s=%d ",
"max_registers",
(int)ctx->max_registers);

for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
w += snprintf(compile_opts+w, compile_opts_size-w,
"-D%s=%d ",
@@ -785,6 +797,14 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
    ctx->cfg->default_tile_size = max_tile_size;
  }


  cl_ulong cache_size;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
                                       sizeof(cache_size), &cache_size, NULL));
  ctx->max_cache = cache_size;

  ctx->max_registers = 1<<16; // I cannot find a way to query for this.

  ctx->max_group_size = max_group_size;
  ctx->max_tile_size = max_tile_size; // No limit.
  ctx->max_threshold = ctx->max_num_groups = 0; // No limit.
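OpenCL has no portable query for a per-workgroup register budget, hence the fixed 1<<16 fallback above, but the cache size does come from a standard device-info query. Note that CL_DEVICE_GLOBAL_MEM_CACHE_SIZE reports the global memory cache in general, not specifically L2. A standalone sketch of that query, not part of this commit, with error handling elided:

#include <CL/cl.h>
#include <stdio.h>

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  cl_ulong cache_size = 0;
  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
  // Global memory cache size in bytes; stored as ctx->max_cache above.
  clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
                  sizeof(cache_size), &cache_size, NULL);
  printf("Global memory cache: %llu bytes\n", (unsigned long long)cache_size);
  return 0;
}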
10 changes: 1 addition & 9 deletions src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
@@ -243,17 +243,9 @@ prepareIntermediateArraysGlobal passage segments hist_T hist_N slugs = do
t64 $
r64 hist_T / hist_C_max

-- Querying L2 cache size is not reliable. Instead we provide a
-- tunable knob with a hopefully sane default.
let hist_L2_def = 4 * 1024 * 1024
hist_L2 <- dPrim "L2_size" int32
entry <- askFunction
-- Equivalent to F_L2*L2 in paper.
sOp
$ Imp.GetSize
(tvVar hist_L2)
(keyWithEntryPoint entry $ nameFromString (prettyString (tvVar hist_L2)))
$ Imp.SizeBespoke (nameFromString "L2_for_histogram") hist_L2_def
sOp $ Imp.GetSizeMax (tvVar hist_L2) $ Imp.SizeCache

let hist_L2_ln_sz = 16 * 4 -- L2 cache line size approximation
hist_RACE_exp <-
8 changes: 8 additions & 0 deletions src/Futhark/IR/GPU/Sizes.hs
@@ -38,6 +38,12 @@ data SizeClass
    SizeLocalMemory
  | -- | A bespoke size with a default.
    SizeBespoke Name Int64
  | -- | Amount of registers available per workgroup. Mostly
    -- meaningful for querying the maximum.
    SizeRegisters
  | -- | Amount of L2 cache memory, in bytes. Mostly meaningful for
    -- querying the maximum.
    SizeCache
  deriving (Eq, Ord, Show)

instance Pretty SizeClass where
@@ -54,6 +60,8 @@ instance Pretty SizeClass where
  pretty SizeLocalMemory = "local_memory"
  pretty (SizeBespoke k def) =
    "bespoke" <> parens (pretty k <> comma <+> pretty def)
  pretty SizeRegisters = "registers"
  pretty SizeCache = "cache"

-- | The default value for the size. If 'Nothing', that means the backend gets to decide.
sizeDefault :: SizeClass -> Maybe Int64