Add preference to manually set default threads per block
It appears that the number of threads per block we pick does not always give
optimal occupancy, see #266
denisalevi committed Feb 13, 2022
1 parent 9f13ea2 commit d28f289
Showing 4 changed files with 21 additions and 4 deletions.
brian2cuda/cuda_prefs.py (9 additions, 1 deletion)
@@ -173,7 +173,15 @@ def validate_bundle_size_expression(string):
application. Since this avoids race conditions, effect application can
be parallelised.''',
validator=lambda v: isinstance(v, bool),
default=True)
default=True),

default_threads_per_block=BrianPreference(
docs='''If set, this overwrites the threads per block chosen by
`cudaOccupancyMaxPotentialBlockSize`, which appears to not always choose the
optimal threads per block. This needs fixing, see #266.''',
validator=lambda v: isinstance(v, int) or v is None,
default=None
)
)

prefs.register_preferences(
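
For context, a usage sketch that is not part of the commit: a Brian2CUDA script could set the new preference as shown below. The device name "cuda_standalone" and the toy model are illustrative; leaving the preference at None keeps the occupancy-based choice.

from brian2 import NeuronGroup, prefs, run, set_device, ms
import brian2cuda  # importing registers the 'cuda_standalone' device and its preferences

set_device("cuda_standalone")

# Override the block size chosen by cudaOccupancyMaxPotentialBlockSize;
# None (the default) keeps the automatic choice.
prefs["devices.cuda_standalone.default_threads_per_block"] = 128

group = NeuronGroup(1000, "dv/dt = -v / (10*ms) : 1", method="exact")
run(10*ms)
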
brian2cuda/device.py (1 addition, 0 deletions)
@@ -254,6 +254,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
template_kwds = dict(template_kwds)
template_kwds['profiled'] = self.enable_profiling
template_kwds['bundle_mode'] = prefs["devices.cuda_standalone.push_synapse_bundles"]
template_kwds['default_threads_per_block'] = prefs["devices.cuda_standalone.default_threads_per_block"]
no_or_const_delay_mode = False
if isinstance(owner, (SynapticPathway, Synapses)) and "delay" in owner.variables and owner.variables["delay"].scalar:
# catches Synapses(..., delay=...) syntax, does not catch the case when no delay is specified at all
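
For orientation only, an illustrative sketch (not code from the repository): entries added to template_kwds surface as plain variables when the Jinja2 templates below are rendered, which is how the preference reaches the new {% if default_threads_per_block %} branch.

from jinja2 import Template

template_kwds = {"default_threads_per_block": 128}
snippet = Template(
    "{% if default_threads_per_block %}"
    "num_threads = {{ default_threads_per_block }};"
    "{% else %}"
    "/* fall back to cudaOccupancyMaxPotentialBlockSize */"
    "{% endif %}"
)
print(snippet.render(**template_kwds))  # prints: num_threads = 128;
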
brian2cuda/templates/common_group.cu (6 additions, 2 deletions)
@@ -187,14 +187,18 @@ void _run_{{codeobj_name}}()
{% block prepare_kernel_inner %}
// get number of blocks and threads
{% if calc_occupancy %}
{% if default_threads_per_block %}
num_threads = {{default_threads_per_block}};
{% else %}
int min_num_blocks; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
CUDA_SAFE_CALL(
cudaOccupancyMaxPotentialBlockSize(&min_num_blocks, &num_threads,
_run_kernel_{{codeobj_name}}, 0, 0) // last args: dynamicSMemSize, blockSizeLimit
);
{% endif %}
// Round up according to array size
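
For reference, a self-contained sketch of the launch-configuration pattern this template generates (the kernel my_kernel, the launch wrapper and its fixed_threads_per_block parameter are illustrative, not brian2cuda code): use the preference value when given, otherwise query cudaOccupancyMaxPotentialBlockSize, then round the grid size up to cover the array. Error checking is omitted for brevity.

__global__ void my_kernel(float *data, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        data[i] *= 2.0f;
}

// fixed_threads_per_block <= 0 means "choose automatically", mirroring the
// default_threads_per_block preference being unset (None).
void launch_my_kernel(float *d_data, int N, int fixed_threads_per_block)
{
    int num_threads;
    if (fixed_threads_per_block > 0)
    {
        num_threads = fixed_threads_per_block;
    }
    else
    {
        int min_num_blocks;  // minimum grid size needed for maximum occupancy
        cudaOccupancyMaxPotentialBlockSize(&min_num_blocks, &num_threads,
                                           my_kernel, 0, 0);
    }
    // Round up according to array size
    int num_blocks = (N + num_threads - 1) / num_threads;
    my_kernel<<<num_blocks, num_threads>>>(d_data, N);
}
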
brian2cuda/templates/spatialstateupdate.cu (5 additions, 1 deletion)
@@ -492,13 +492,17 @@ __global__ void _currents_kernel_{{codeobj_name}}(
// calculate number of threads that maximize occupancy
// and also the corresponding number of blocks
// the code below is adapted from common_group.cu
{% if default_threads_per_block %}
num_threads_currents = {{default_threads_per_block}};
{% else %}
int min_num_blocks_currents; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch

CUDA_SAFE_CALL(
cudaOccupancyMaxPotentialBlockSize(&min_num_blocks_currents, &num_threads_currents,
_currents_kernel_{{codeobj_name}}, 0, 0) // last args: dynamicSMemSize, blockSizeLimit
);
{% endif %}

// Round up according to array size
num_blocks_currents = (_N + num_threads_currents - 1) / num_threads_currents;
