Permit querying of cache size and number of registers.
Use this in SegHist codegen.
athas committed Dec 6, 2023
1 parent 2abaf42 commit 6f15eba
Showing 5 changed files with 48 additions and 9 deletions.
9 changes: 9 additions & 0 deletions rts/c/backends/cuda.h
@@ -289,6 +289,7 @@ struct futhark_context {
  size_t max_threshold;
  size_t max_local_memory;
  size_t max_bespoke;
  size_t max_cache;

  size_t lockstep_width;

@@ -457,6 +458,12 @@ static void cuda_nvrtc_mk_build_options(struct futhark_context *ctx, const char
  opts[i++] = msgprintf("-D%s=%d",
                        "max_group_size",
                        (int)ctx->max_group_size);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_local_memory",
                        (int)ctx->max_local_memory);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_registers",
                        (int)ctx->max_registers);
  for (int j = 0; j < cfg->num_tuning_params; j++) {
    opts[i++] = msgprintf("-D%s=%zu", cfg->tuning_param_vars[j],
                          cfg->tuning_params[j]);
@@ -804,6 +811,8 @@ int backend_context_setup(struct futhark_context* ctx) {
  ctx->max_tile_size = sqrt(ctx->max_group_size);
  ctx->max_threshold = 0;
  ctx->max_bespoke = 0;
  ctx->max_registers = device_query(ctx->dev, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
  ctx->max_cache = device_query(ctx->dev, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE);
  ctx->lockstep_width = device_query(ctx->dev, WARP_SIZE);
  CUDA_SUCCEED_FATAL(cuStreamCreate(&ctx->stream, CU_STREAM_DEFAULT));
  cuda_size_setup(ctx);
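For reference, the two CUDA attributes used in backend_context_setup above can also be queried directly through the driver API, which the RTS helper device_query presumably wraps. A minimal standalone sketch, not part of this commit, with error handling elided:

#include <cuda.h>
#include <stdio.h>

int main(void) {
  CUdevice dev;
  int l2_bytes = 0, regs_per_block = 0;
  cuInit(0);
  cuDeviceGet(&dev, 0);
  // L2 cache size in bytes; stored as ctx->max_cache above.
  cuDeviceGetAttribute(&l2_bytes, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
  // Maximum 32-bit registers usable by a single block; stored as ctx->max_registers.
  cuDeviceGetAttribute(&regs_per_block, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  printf("L2 cache: %d bytes, registers per block: %d\n", l2_bytes, regs_per_block);
  return 0;
}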
10 changes: 10 additions & 0 deletions rts/c/backends/hip.h
@@ -263,6 +263,8 @@ struct futhark_context {
  size_t max_threshold;
  size_t max_local_memory;
  size_t max_bespoke;
  size_t max_registers;
  size_t max_cache;

  size_t lockstep_width;

@@ -473,6 +475,12 @@ static void hiprtc_mk_build_options(struct futhark_context *ctx, const char *ext
  opts[i++] = msgprintf("-D%s=%d",
                        "max_group_size",
                        (int)ctx->max_group_size);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_local_memory",
                        (int)ctx->max_local_memory);
  opts[i++] = msgprintf("-D%s=%d",
                        "max_registers",
                        (int)ctx->max_registers);
  for (int j = 0; j < cfg->num_tuning_params; j++) {
    opts[i++] = msgprintf("-D%s=%zu", cfg->tuning_param_vars[j],
                          cfg->tuning_params[j]);
@@ -659,6 +667,8 @@ int backend_context_setup(struct futhark_context* ctx) {
  ctx->max_tile_size = sqrt(ctx->max_group_size);
  ctx->max_threshold = 0;
  ctx->max_bespoke = 0;
  ctx->max_registers = device_query(ctx->dev, hipDeviceAttributeMaxRegistersPerBlock);
  ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
  // FIXME: in principle we should query hipDeviceAttributeWarpSize
  // from the device, which will provide 64 on AMD GPUs.
  // Unfortunately, we currently do nasty implicit intra-warp
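The HIP runtime exposes the same two attributes through hipDeviceGetAttribute; a corresponding standalone sketch, not part of this commit, with error handling elided:

#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
  int l2_bytes = 0, regs_per_block = 0;
  // L2 cache size in bytes; stored as ctx->max_cache above.
  hipDeviceGetAttribute(&l2_bytes, hipDeviceAttributeL2CacheSize, 0);
  // Maximum registers usable by a single workgroup; stored as ctx->max_registers.
  hipDeviceGetAttribute(&regs_per_block, hipDeviceAttributeMaxRegistersPerBlock, 0);
  printf("L2 cache: %d bytes, registers per block: %d\n", l2_bytes, regs_per_block);
  return 0;
}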
20 changes: 20 additions & 0 deletions rts/c/backends/opencl.h
@@ -528,6 +528,8 @@ struct futhark_context {
  size_t max_tile_size;
  size_t max_threshold;
  size_t max_local_memory;
  size_t max_registers;
  size_t max_cache;

  size_t lockstep_width;

@@ -594,6 +596,16 @@ static char* mk_compile_opts(struct futhark_context *ctx,
"max_group_size",
(int)ctx->max_group_size);

w += snprintf(compile_opts+w, compile_opts_size-w,
"-D%s=%d ",
"max_local_memory",
(int)ctx->max_local_memory);

w += snprintf(compile_opts+w, compile_opts_size-w,
"-D%s=%d ",
"max_registers",
(int)ctx->max_registers);

for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
w += snprintf(compile_opts+w, compile_opts_size-w,
"-D%s=%d ",
@@ -785,6 +797,14 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
    ctx->cfg->default_tile_size = max_tile_size;
  }


  cl_ulong cache_size;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
                                       sizeof(cache_size), &cache_size, NULL));
  ctx->max_cache = cache_size;

  ctx->max_registers = 1<<16; // I cannot find a way to query for this.

  ctx->max_group_size = max_group_size;
  ctx->max_tile_size = max_tile_size; // No limit.
  ctx->max_threshold = ctx->max_num_groups = 0; // No limit.
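OpenCL has no portable query for a per-workgroup register budget, hence the fixed 1<<16 fallback above, but the cache size does come from a standard device-info query. Note that CL_DEVICE_GLOBAL_MEM_CACHE_SIZE reports the global memory cache in general, not specifically L2. A standalone sketch of that query, not part of this commit, with error handling elided:

#include <CL/cl.h>
#include <stdio.h>

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  cl_ulong cache_size = 0;
  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
  // Global memory cache size in bytes; stored as ctx->max_cache above.
  clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
                  sizeof(cache_size), &cache_size, NULL);
  printf("Global memory cache: %llu bytes\n", (unsigned long long)cache_size);
  return 0;
}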
10 changes: 1 addition & 9 deletions src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
@@ -243,17 +243,9 @@ prepareIntermediateArraysGlobal passage segments hist_T hist_N slugs = do
t64 $
r64 hist_T / hist_C_max

-- Querying L2 cache size is not reliable. Instead we provide a
-- tunable knob with a hopefully sane default.
let hist_L2_def = 4 * 1024 * 1024
hist_L2 <- dPrim "L2_size" int32
entry <- askFunction
-- Equivalent to F_L2*L2 in paper.
sOp
$ Imp.GetSize
(tvVar hist_L2)
(keyWithEntryPoint entry $ nameFromString (prettyString (tvVar hist_L2)))
$ Imp.SizeBespoke (nameFromString "L2_for_histogram") hist_L2_def
sOp $ Imp.GetSizeMax (tvVar hist_L2) $ Imp.SizeCache

let hist_L2_ln_sz = 16 * 4 -- L2 cache line size approximation
hist_RACE_exp <-
8 changes: 8 additions & 0 deletions src/Futhark/IR/GPU/Sizes.hs
@@ -38,6 +38,12 @@ data SizeClass
    SizeLocalMemory
  | -- | A bespoke size with a default.
    SizeBespoke Name Int64
  | -- | Amount of registers available per workgroup. Mostly
    -- meaningful for querying the maximum.
    SizeRegisters
  | -- | Amount of L2 cache memory, in bytes. Mostly meaningful for
    -- querying the maximum.
    SizeCache
  deriving (Eq, Ord, Show)

instance Pretty SizeClass where
@@ -54,6 +60,8 @@ instance Pretty SizeClass where
  pretty SizeLocalMemory = "local_memory"
  pretty (SizeBespoke k def) =
    "bespoke" <> parens (pretty k <> comma <+> pretty def)
  pretty SizeRegisters = "registers"
  pretty SizeCache = "cache"

-- | The default value for the size. If 'Nothing', that means the backend gets to decide.
sizeDefault :: SizeClass -> Maybe Int64