Skip to content

Commit

Permalink
refs #6: Fix a memory allocation bug in GPU IO buffers.
Browse files Browse the repository at this point in the history
 * The allocator must return the start position as it was "before" adding
   the newly allocated size to the current position (in lib/mempool.hh).

 * The GPU no longer crashes, but it still hangs after processing the first
   offload task.
  • Loading branch information
achimnol committed May 28, 2015
1 parent 2752d8b commit 3e6202e
Show file tree
Hide file tree
Showing 8 changed files with 22 additions and 43 deletions.
12 changes: 4 additions & 8 deletions engines/cuda/compat.hh
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,10 @@ struct datablock_kernel_arg {
void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
union {
uint16_t item_size_in;
uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
};
union {
uint16_t item_size_out;
uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
};
uint16_t item_size_in;
uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
uint16_t item_size_out;
uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
};
Expand Down
4 changes: 2 additions & 2 deletions engines/cuda/mempool.hh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public:
size_t offset;
int ret = _alloc(size, &offset);
if (ret == 0)
return (void *) ((uint8_t *) base_ + offset);
return (void *) ((uint8_t *) base_ + (uintptr_t) offset);
return NULL;
}

Expand Down Expand Up @@ -80,7 +80,7 @@ public:
size_t offset;
int ret = _alloc(size, &offset);
if (ret == 0)
return (void *) ((uint8_t *) base_ + offset);
return (void *) ((uint8_t *) base_ + (uintptr_t) offset);
return NULL;
}

Expand Down
2 changes: 1 addition & 1 deletion lib/config.hh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#define NBA_MAX_DATABLOCKS (12) // If too large (e.g., 64), batch_pool can not be allocated.

#define NBA_OQ (true) // Use output-queuing semantics when possible.
#define NBA_CPU_MICROBENCH // Enable support for PAPI library for microbenchmarks.
#undef NBA_CPU_MICROBENCH // Enable support for PAPI library for microbenchmarks.

namespace nba {

Expand Down
1 change: 0 additions & 1 deletion lib/datablock.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ tuple<size_t, size_t> DataBlock::calc_write_buffer_size(PacketBatch *batch)
case WRITE_FIXED_SEGMENTS: {

num_write_items = batch->count;
//write_item_sizes.size = write_roi.length;
write_buffer_size = write_roi.length * num_write_items;

break; }
Expand Down
12 changes: 4 additions & 8 deletions lib/datablock.hh
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,10 @@ struct datablock_kernel_arg {
void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
union {
uint16_t item_size_in;
uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
};
union {
uint16_t item_size_out;
uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
};
uint16_t item_size_in;
uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
uint16_t item_size_out;
uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
}; // __attribute__((aligned(8)));
Expand Down
4 changes: 2 additions & 2 deletions lib/elementgraph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ void ElementGraph::flush_offloaded_tasks()
/* Prepare to offload. */
cctx->state = ComputeContext::PREPARING;
cctx->currently_running_task = task;
// FIXME: dedicate a single cctx to each computation thread
// (It is fine when COPROC_CTX_PER_COMPTHREAD is set to 1, but it should simply be fixed to 1.)
task->cctx = cctx;

/* On the GPU side, the datablocks argument contains only the used
 * datablocks, packed at the beginning of the array (not sparsely). */
int datablock_ids[NBA_MAX_DATABLOCKS];
size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
for (unsigned k = 0; k < num_db_used; k++) {
Expand Down
4 changes: 2 additions & 2 deletions lib/mempool.hh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ public:
{
if (curpos_ + size > max_size_)
return -1;
curpos_ = __ALIGN(curpos_, 64);
if (start_offset)
/* IMPORTANT: We need to return the position before adding the new size. */
if (start_offset != nullptr)
*start_offset = curpos_;
curpos_ += size;
curpos_ = __ALIGN(curpos_, 64);
Expand Down
26 changes: 7 additions & 19 deletions lib/offloadtask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,23 +121,24 @@ bool OffloadTask::copy_h2d()
bool has_h2d_copies = false;

/* Copy the datablock information for the first kernel argument. */
size_t dbarray_size = ALIGN(sizeof(struct datablock_kernel_arg) * NBA_MAX_DATABLOCKS, CACHE_LINE_SIZE);
size_t dbarray_size = ALIGN(sizeof(struct datablock_kernel_arg) * datablocks.size(), CACHE_LINE_SIZE);
cctx->alloc_input_buffer(dbarray_size, (void **) &dbarray_h, &dbarray_d);
assert(dbarray_h != nullptr);
size_t itemszarray_size = 0;

for (int dbid : datablocks) {
int b = 0;
int dbid_d = dbid_h2d[dbid];
dbarray_h[dbid_d].total_item_count_in = 0;
dbarray_h[dbid_d].total_item_count_out = 0;
assert(dbid_d < datablocks.size());

DataBlock *db = comp_ctx->datablock_registry[dbid];
struct read_roi_info rri;
struct write_roi_info wri;
db->get_read_roi(&rri);
db->get_write_roi(&wri);

int b = 0;
for (PacketBatch *batch : batches) {
struct datablock_tracker *t = &batch->datablock_states[dbid];

Expand Down Expand Up @@ -204,21 +205,18 @@ bool OffloadTask::copy_h2d()

/* Coalesced H2D data copy. */
void *first_host_in_ptr = nullptr;
char printbuf[4096];
char *pb = &printbuf[0];
int copies = 0;
memory_t first_dev_in_ptr;
size_t total_size = 0;
pb += (intptr_t) sprintf(pb, "Host-to-device copy:\n");
for (int dbid : datablocks) {
int b = 0;
for (PacketBatch *batch : batches) {
struct datablock_tracker *t = &batch->datablock_states[dbid];
if (t == nullptr || t->host_in_ptr == nullptr || t->in_count == 0 || t->in_size == 0) {
#ifdef COALESC_COPY
if (first_host_in_ptr != nullptr) {
/* Discontinued copy. */
cctx->enqueue_memwrite_op(first_host_in_ptr, first_dev_in_ptr, 0, total_size);
pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_in_ptr, (char*)first_host_in_ptr + (uintptr_t)total_size);
/* Reset. */
first_host_in_ptr = nullptr;
total_size = 0;
Expand All @@ -232,21 +230,19 @@ bool OffloadTask::copy_h2d()
first_host_in_ptr = t->host_in_ptr;
first_dev_in_ptr = t->dev_in_ptr;
}
total_size += t->in_size;
total_size += ALIGN(t->in_size, CACHE_LINE_SIZE);
#ifndef COALESC_COPY
cctx->enqueue_memwrite_op(t->host_in_ptr, t->dev_in_ptr, 0, t->in_size);
#endif
//printf("%p - %p\n", t->host_in_ptr, (void *)((char*) t->host_in_ptr + t->in_size));
has_h2d_copies = true;
b++;
}
}
#ifdef COALESC_COPY
if (first_host_in_ptr != nullptr) {
/* Finished copy. */
cctx->enqueue_memwrite_op(first_host_in_ptr, first_dev_in_ptr, 0, total_size);
pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_in_ptr, (char*)first_host_in_ptr + (uintptr_t)total_size);
}
//printf("%s", printbuf);
#endif
return has_h2d_copies;
}
Expand Down Expand Up @@ -298,8 +294,6 @@ void OffloadTask::execute()
}
batch_id ++;
}
//cctx->enqueue_memwrite_op(batch_ids_h, batch_ids_d, 0, sizeof(uint16_t) * all_item_count);
//cctx->enqueue_memwrite_op(item_ids_h, item_ids_d, 0, sizeof(uint16_t) * all_item_count);
cctx->enqueue_memwrite_op(batch_ids_h, batch_ids_d, 0, ALIGN(sizeof(uint16_t) * all_item_count, CACHE_LINE_SIZE) * 2);

cctx->clear_checkbits();
Expand Down Expand Up @@ -340,12 +334,9 @@ bool OffloadTask::copy_d2h()
/* Coalesced D2H data copy. */
bool has_d2h_copies = false;
void *first_host_out_ptr = nullptr;
char printbuf[4096];
char *pb = &printbuf[0];
int copies = 0;
memory_t first_dev_out_ptr;
size_t total_size = 0;
pb += (intptr_t) sprintf(pb, "Device-to-host copy:\n");
for (int dbid : datablocks) {
DataBlock *db = comp_ctx->datablock_registry[dbid];
for (PacketBatch *batch : batches) {
Expand All @@ -355,7 +346,6 @@ bool OffloadTask::copy_d2h()
if (first_host_out_ptr != nullptr) {
/* Discontinued copy. */
cctx->enqueue_memread_op(first_host_out_ptr, first_dev_out_ptr, 0, total_size);
pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_out_ptr, (char*)first_host_out_ptr + (uintptr_t)total_size);
/* Reset. */
first_host_out_ptr = nullptr;
total_size = 0;
Expand All @@ -369,7 +359,7 @@ bool OffloadTask::copy_d2h()
first_host_out_ptr = t->host_out_ptr;
first_dev_out_ptr = t->dev_out_ptr;
}
total_size += t->out_size;
total_size += ALIGN(t->out_size, CACHE_LINE_SIZE);
#ifndef COALESC_COPY
cctx->enqueue_memread_op(t->host_out_ptr, t->dev_out_ptr, 0, t->out_size);
#endif
Expand All @@ -380,9 +370,7 @@ bool OffloadTask::copy_d2h()
if (first_host_out_ptr != nullptr) {
/* Finished copy. */
cctx->enqueue_memread_op(first_host_out_ptr, first_dev_out_ptr, 0, total_size);
pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_out_ptr, (char*)first_host_out_ptr + (uintptr_t)total_size);
}
//printf("%s", printbuf);
#endif
return has_d2h_copies;
}
Expand Down

0 comments on commit 3e6202e

Please sign in to comment.