Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sort multiple run bugs #300

Merged
merged 25 commits into from
Jun 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9c02ce3
Remove nvprof start/stop code
denisalevi Jun 2, 2022
022412e
Add comment in benchmark script for nvprof
denisalevi Jun 2, 2022
da58c6c
Reorder collection multisynaptic indices and host only variables
denisalevi Jun 7, 2022
416cd92
Remove all host->device copies for host only vars
denisalevi Jun 7, 2022
367cf3e
First idea to copy delay back to device if needed
denisalevi Jun 8, 2022
0d6ba69
Revert "First idea to copy delay back to device if needed"
denisalevi Jun 8, 2022
651043a
Only delete delay on device if not used between run calls
denisalevi Jun 8, 2022
2c36a2c
Remove time variables to device copies
denisalevi Jun 8, 2022
421b801
Fix detection when delay is set after run
denisalevi Jun 8, 2022
36771e9
Merge branch 'master' into sort-multiple-run-bugs
denisalevi Jun 10, 2022
df739ec
Expose device pointer in copy to symbol macro
denisalevi Jun 13, 2022
2a1b579
Fix spike queue allocation for multiple runs
denisalevi Jun 15, 2022
bc34442
Add tests for reducing heterogeneous delays between runs
denisalevi Jun 15, 2022
7174deb
Move eventspace initialization from device.py to object.cu
denisalevi Jun 22, 2022
6ba4660
Correctly rotate and add new eventspaces when delay changed
denisalevi Jun 22, 2022
ea32da8
Simplify setting eventspace index for effect application
denisalevi Jun 22, 2022
8a09805
Remove some memory leaks for multiple `run` calls
denisalevi Jun 22, 2022
64877eb
Add xfail tests for changing delay or dt between runs
denisalevi Jun 22, 2022
1c97697
Remove multiple run warning
denisalevi Jun 22, 2022
ecf9f2e
Fix delay availability after run call
denisalevi Jun 22, 2022
d12a15d
Fix spikequeue assert for many delays
denisalevi Jun 22, 2022
0020cf9
Fix tests to pass for unsorted `SpikeMonitor`
denisalevi Jun 22, 2022
2851c82
Revert "Simplify setting eventspace index for effect application"
denisalevi Jun 23, 2022
7533518
Fix test dimension error
denisalevi Jun 23, 2022
f9d66d3
Add circular eventspace clock test with NeuronGroup
denisalevi Jun 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 89 additions & 26 deletions brian2cuda/brianlib/spikequeue.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ using namespace std;
// variables (delays, dt) are assumed to use the same data type
typedef int32_t DTYPE_int;

typedef cudaVector<DTYPE_int> cuda_vector;

class CudaSpikeQueue
{
private:
Expand All @@ -31,7 +33,7 @@ class CudaSpikeQueue

public:
//these vectors should ALWAYS be the same size, since each index refers to a triple of (pre_id, syn_id, post_id)
cudaVector<DTYPE_int>** synapses_queue;
cuda_vector** synapses_queue;

//our connectivity matrix with dimensions (num_blocks) * neuron_N
//each element
Expand All @@ -47,22 +49,26 @@ class CudaSpikeQueue
int* unique_delay_start_idcs;
int current_offset; // offset in circular queue structure
int num_queues;
int num_delays;
//int max_num_delays_per_block;
int num_blocks;
int neuron_N; // number of neurons in source of SynapticPathway
int syn_N;

// When we have 0 synapses, prepare() is not called in synapses_initialise_queue.cu
// and for destroy() to still work, synapses_queue needs to be a null pointer
__device__ CudaSpikeQueue(): synapses_queue(0) {};
// When we have 0 synapses, prepare() is not called in
// before_run_synapses_push_spikes and for destroy() to still work,
// synapses_queue needs to be a null pointer
__device__ CudaSpikeQueue(): synapses_queue(0), semaphore(0), num_queues(0) {};

//Since we can't have a destructor, we need to call this function manually
__device__ void destroy()
{
if(synapses_queue)
{
delete [] synapses_queue;
delete [] semaphore;
synapses_queue = 0;
semaphore = 0;
}
}

Expand All @@ -76,7 +82,7 @@ class CudaSpikeQueue
double _dt,
int _neuron_N,
int _syn_N,
int _num_queues,
int _num_delays,
int* _num_synapses_by_pre,
int* _num_synapses_by_bundle,
int* _num_unique_delays_by_pre,
Expand All @@ -89,18 +95,59 @@ class CudaSpikeQueue
int* _unique_delay_start_idcs
)
{
if(tid == 0)
// read queue information from a previous run
// (these are all null at the first run)
int old_num_queues = num_queues;
int required_num_queues = _num_delays + 1;
cuda_vector** old_synapses_queue = synapses_queue;
bool require_new_queues = (required_num_queues > old_num_queues);
int old_current_offset = current_offset;
bool initialize_semaphores = (!semaphore);

if (tid == 0)
{
// TODO add comments
// allocate semaphore memory only at first prepare() call
if (initialize_semaphores)
{
semaphore = new int[_num_blocks];
}

// only allocate queue pointer memory if the number of queues increased
if (require_new_queues)
{
synapses_queue = new cuda_vector*[required_num_queues];
if (!synapses_queue)
{
printf("ERROR while allocating memory with size %ld in"
" spikequeue.h/prepare()\n",
sizeof(cuda_vector*) * required_num_queues);
}
// only reset queue offset if we require new queues, in which
// case we copy the old queues such that the offset is reset
// (if there are no new queues, the queues remain as they are)
current_offset = 0;
}

semaphore = new int[_num_blocks];
current_offset = 0;
// set class attributes
assert(num_threads <= required_num_queues); // else parallel loop fails below
if (!initialize_semaphores)
{
assert(_num_blocks == num_blocks); // can't change between runs
}
num_blocks = _num_blocks;
neuron_N = _neuron_N;
syn_N = _syn_N;
num_queues = _num_queues;
num_delays = _num_delays;
// we only add queues, but never remove queues (because we could
// lose spikes in the queues)
if (require_new_queues)
{
num_queues = required_num_queues;
}

// TODO: do we need num_synapses_by_pre? is num_synapses_by_pre[pre_post_block_id] faster then synapses_by_pre[pre_post_block_id].size()?
// TODO: do we need num_synapses_by_pre? is
// num_synapses_by_pre[pre_post_block_id] faster than
// synapses_by_pre[pre_post_block_id].size()?
// if so, add unique_num_synapses_by_pre as well!
num_synapses_by_pre = _num_synapses_by_pre;
num_synapses_by_bundle = _num_synapses_by_bundle;
Expand All @@ -113,27 +160,45 @@ class CudaSpikeQueue
unique_delays_offset_by_pre = _unique_delays_offset_by_pre;
unique_delay_start_idcs = _unique_delay_start_idcs;

synapses_queue = new cudaVector<DTYPE_int>*[num_queues];
if(!synapses_queue)
{
printf("ERROR while allocating memory with size %ld in spikequeue.h/prepare()\n", sizeof(cudaVector<DTYPE_int>*)*num_queues);
}
}
__syncthreads();

for (int i = tid; i < _num_blocks; i+=num_threads)
// initialize semaphores only if they were not initialized before
if (initialize_semaphores)
{
semaphore[i] = 0;
for (int i = tid; i < _num_blocks; i+=num_threads)
{
semaphore[i] = 0;
}
}

for(int i = tid; i < num_queues; i+=num_threads)
// setup the new queues
if (require_new_queues)
{
synapses_queue[i] = new cudaVector<DTYPE_int>[num_blocks];
if(!synapses_queue[i])
// copy old queues over to new queue array
for (int i = tid; i < required_num_queues; i += num_threads)
{
printf("ERROR while allocating memory with size %ld in spikequeue.h/prepare()\n", sizeof(cudaVector<DTYPE_int>)*num_blocks);
if (i < old_num_queues)
{
// copy the old queues to the new array, such that the
// offset is reset back to the start (current_offset is set
// to zero above)
int old_i = (i + old_current_offset) % old_num_queues;
synapses_queue[i] = old_synapses_queue[old_i];
} else
{
// allocate new memory for cudaVectors of new queues
synapses_queue[i] = new cuda_vector[num_blocks];
if (!synapses_queue[i])
{
printf("ERROR while allocating memory with size %ld in"
" spikequeue.h/prepare()\n",
sizeof(cuda_vector)*num_blocks);
}
}
}
}

};

__device__ void push_synapses(
Expand Down Expand Up @@ -436,8 +501,7 @@ class CudaSpikeQueue

} // end push_bundles()

__device__ void advance(
int tid)
__device__ void advance(int tid)
{
assert(tid < num_blocks && current_offset < num_queues);
synapses_queue[current_offset][tid].reset();
Expand All @@ -446,8 +510,7 @@ class CudaSpikeQueue
current_offset = (current_offset + 1)%num_queues;
}

__device__ void peek(
cudaVector<DTYPE_int>** _synapses_queue)
__device__ void peek(cuda_vector** _synapses_queue)
{
*(_synapses_queue) = &(synapses_queue[current_offset][0]);
}
Expand Down
Loading