refs #6: Fix some logical bugs and double the performance!
 * Some operations were not being executed when checking the
   TASK_PREPARED status in send_offload_task_to_device().

 * It still needs further improvement.
achimnol committed Jan 13, 2016
1 parent ae60d38 commit edfe539
Showing 6 changed files with 43 additions and 34 deletions.
2 changes: 1 addition & 1 deletion include/nba/framework/config.hh
@@ -57,7 +57,7 @@
 #define NBA_OQ (true)          // Use output-queuing semantics when possible.
 #undef NBA_CPU_MICROBENCH      // Enable support for PAPI library for microbenchmarks.
 
-#define NBA_REUSE_DATABLOCKS (1)
+#define NBA_REUSE_DATABLOCKS
 
 /* If you change below, update HANDLE_ALL_PORTS macro in lib/element.hh as well!! */
 #define NBA_MAX_ELEM_NEXTS (4)
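Aside (not from the commit): dropping the "(1)" value is harmless for the #ifdef checks this commit adds elsewhere, because #ifdef tests only whether the macro is defined; a value-based #if test, by contrast, would now fail to compile. A minimal sketch:

    #define NBA_REUSE_DATABLOCKS

    #ifdef NBA_REUSE_DATABLOCKS
    // compiled: #ifdef asks only "is this macro defined?"
    #endif

    // #if NBA_REUSE_DATABLOCKS  // would now be a compile error,
                                 // since the macro expands to nothing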
1 change: 1 addition & 0 deletions include/nba/framework/elementgraph.hh
@@ -57,6 +57,7 @@ public:
     void scan_schedulable_elements(uint64_t loop_count);
 
     /* Scans and executes dispatch() handlers of offloadable elements.
+     * It is a shorthand version that ignores next_delay output arguments.
      * This fetches the GPU-processed batches and feeds them into the graph
      * again. */
     void scan_offloadable_elements(uint64_t loop_count);
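A hedged sketch of how a worker loop might drive the two scan methods each iteration (hypothetical fragment; the flag and variable names are illustrative, not NBA's actual scheduler code):

    uint64_t loop_count = 0;
    while (!shutdown_requested) {            // illustrative termination flag
        elem_graph->scan_schedulable_elements(loop_count);
        elem_graph->scan_offloadable_elements(loop_count);  // re-injects GPU-processed batches
        loop_count++;
    }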
2 changes: 1 addition & 1 deletion src/engines/cuda/computecontext.cc
@@ -16,7 +16,7 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de
 /* NOTE: Write-combined memory degrades performance to half... */
 {
     type_name = "cuda";
-    size_t io_base_size = 5 * 1024 * 1024; // TODO: read from config
+    size_t io_base_size = 4 * 1024 * 1024; // TODO: read from config
     cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
     io_base_ring.init(NBA_MAX_IO_BASES, node_id, io_base_ring_buf);
     for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
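For intuition, a rough sketch of the I/O base scheme implied above: NBA_MAX_IO_BASES reusable slots, each io_base_size bytes inside one mother buffer, recycled through a free ring (assumption: this only loosely mirrors NBA's actual ring type and buffer layout):

    #include <queue>

    // Illustrative free-slot ring: alloc() pops an index, release() returns it.
    struct IoBaseRingSketch {
        std::queue<unsigned> free_list;
        void init(unsigned n) { for (unsigned i = 0; i < n; i++) free_list.push(i); }
        int alloc() {                        // -1 plays the role of INVALID_IO_BASE
            if (free_list.empty()) return -1;
            unsigned idx = free_list.front(); free_list.pop();
            return (int) idx;
        }
        void release(unsigned idx) { free_list.push(idx); }
    };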
2 changes: 2 additions & 0 deletions src/lib/computation.cc
@@ -185,6 +185,7 @@ void comp_thread_context::build_element_graph(const char* config_file)
         }
     }
     RTE_LOG(INFO, ELEM, "Number of linear groups: %lu\n", linear_groups.size());
+#ifdef NBA_REUSE_DATABLOCKS
     for (vector<GraphMetaData *> group : linear_groups) {
 
         Element *prev_el = nullptr;
@@ -254,6 +255,7 @@ void comp_thread_context::build_element_graph(const char* config_file)
         }
         // TODO: one more check is needed here so that the case where an offloadable element comes last is handled correctly.
     }
+#endif
     click_destroy_configuration(pi);
     fclose(input);
     elemgraph_lock->release();
66 changes: 34 additions & 32 deletions src/lib/elementgraph.cc
@@ -39,6 +39,7 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
     input_elem = nullptr;
     assert(0 == rte_malloc_validate(ctx, NULL));
 
+#ifdef NBA_REUSE_DATABLOCKS
     struct rte_hash_parameters hparams;
     char namebuf[RTE_HASH_NAMESIZE];
     sprintf(namebuf, "elemgraph@%u.%u:offl_actions", ctx->loc.node_id, ctx->loc.local_thread_idx);
@@ -50,6 +51,9 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
     hparams.socket_id = ctx->loc.node_id;
     offl_actions = rte_hash_create(&hparams);
     assert(offl_actions != nullptr);
+#else
+    offl_actions = nullptr;
+#endif
 }
 
 void ElementGraph::send_offload_task_to_device(OffloadTask *task)
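For readers unfamiliar with DPDK hash tables, a hedged sketch of how offl_actions could be populated and queried (illustrative only; the key and value types NBA actually stores are not visible in this hunk):

    #include <rte_hash.h>

    uint64_t key = 0x1234;                   // hypothetical key layout
    void *action = nullptr;                  // hypothetical payload pointer
    rte_hash_add_key_data(offl_actions, &key, action);

    void *found = nullptr;
    if (rte_hash_lookup_data(offl_actions, &key, &found) >= 0) {
        // found now holds the pointer stored above
    }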
@@ -69,45 +73,43 @@ void ElementGraph::send_offload_task_to_device(OffloadTask *task)
 
     /* Prepare to offload. */
     if (task->state < TASK_PREPARED) {
-        //bool had_io_base = (task->io_base != INVALID_IO_BASE);
-        bool has_io_base = false;
-        if (task->io_base == INVALID_IO_BASE) {
-            task->io_base = cctx->alloc_io_base();
-            has_io_base = (task->io_base != INVALID_IO_BASE);
+        /* In the GPU side, datablocks argument has only used
+         * datablocks in the beginning of the array (not sparsely). */
+        int datablock_ids[NBA_MAX_DATABLOCKS];
+        size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
+        for (unsigned k = 0; k < num_db_used; k++) {
+            int dbid = datablock_ids[k];
+            task->datablocks.push_back(dbid);
+            task->dbid_h2d[dbid] = k;
+        }
-            if (has_io_base) {
-                /* In the GPU side, datablocks argument has only used
-                 * datablocks in the beginning of the array (not sparsely). */
-                int datablock_ids[NBA_MAX_DATABLOCKS];
-                size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
-                for (unsigned k = 0; k < num_db_used; k++) {
-                    int dbid = datablock_ids[k];
-                    task->datablocks.push_back(dbid);
-                    task->dbid_h2d[dbid] = k;
-                }
 
-                size_t num_batches = task->batches.size();
-                /* As we reuse tasks between subsequent offloadables
-                 * and only do so in linear groups of elements,
-                 * it is okay to check only the first batch. */
-                if (task->batches[0]->datablock_states == nullptr) {
-                    void *dbstates[num_batches];
-                    int bidx = 0;
-                    assert(0 == rte_mempool_get_bulk(ctx->dbstate_pool, (void **) &dbstates,
-                                                     num_batches));
-                    for (PacketBatch *batch : task->batches) {
-                        batch->datablock_states = (struct datablock_tracker *) dbstates[bidx];
-                        bidx ++;
-                    }
+        size_t num_batches = task->batches.size();
+        /* As we reuse tasks between subsequent offloadables
+         * and only do so in linear groups of elements,
+         * it is okay to check only the first batch. */
+        if (task->batches[0]->datablock_states == nullptr) {
+            void *dbstates[num_batches];
+            int bidx = 0;
+            assert(0 == rte_mempool_get_bulk(ctx->dbstate_pool, (void **) &dbstates,
+                                             num_batches));
+            for (PacketBatch *batch : task->batches) {
+                batch->datablock_states = (struct datablock_tracker *) dbstates[bidx];
+                bidx ++;
+            }
-                    task->offload_start = 0;
         }
+        task->offload_start = 0;
 
+        if (task->io_base == INVALID_IO_BASE) {
+            task->io_base = cctx->alloc_io_base();
+            bool has_io_base = (task->io_base != INVALID_IO_BASE);
+            assert(has_io_base);
             /* Calculate required buffer sizes, allocate them, and initialize them.
              * The mother buffer is statically allocated on start-up and here we
              * reserve regions inside it. */
-                task->prepare_read_buffer();
-                task->prepare_write_buffer();
-            } /* endif(has_io_base) */
-        } /* endif(!has_io_base) */
+            task->prepare_read_buffer();
+            task->prepare_write_buffer();
+        }
 
         task->state = TASK_PREPARED;
     } /* endif(!task.prepared) */
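Condensed, the fix in this hunk: preparation used to run only inside the branch that had just allocated a fresh io_base, so a reused task whose io_base was still valid skipped it entirely; now the preparation always runs once per unprepared task and io_base allocation is handled separately. A sketch (the helper names are condensations, not real functions):

    // Before (buggy, condensed):
    if (task->io_base == INVALID_IO_BASE) {
        task->io_base = cctx->alloc_io_base();
        if (task->io_base != INVALID_IO_BASE)
            prepare_datablocks_and_buffers(task);   // skipped for reused tasks!
    }

    // After (fixed, condensed):
    prepare_datablocks_and_states(task);            // always runs when unprepared
    task->offload_start = 0;
    if (task->io_base == INVALID_IO_BASE) {         // allocate only when missing
        task->io_base = cctx->alloc_io_base();
        assert(task->io_base != INVALID_IO_BASE);
        task->prepare_read_buffer();
        task->prepare_write_buffer();
    }
    task->state = TASK_PREPARED;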

4 changes: 4 additions & 0 deletions src/lib/io.cc
@@ -147,6 +147,7 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
     uint64_t total_batch_size = 0;
     for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++)
         total_batch_size += task->batches[b]->count;
+#ifdef NBA_REUSE_DATABLOCKS
     if (ctx->elem_graph->check_next_offloadable(task->elem)) {
         for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
             task->batches[b]->compute_time += (uint64_t)
@@ -160,6 +161,9 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
                        0);
         /* This task is reused. We keep it intact. */
     } else {
+#else
+    {
+#endif
         for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
             task->batches[b]->compute_time += (uint64_t)
                 ((float) task_cycles / total_batch_size
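The callback spreads one task's measured cycles evenly over every packet the task carried; a worked example with hypothetical numbers (the real expression continues past what the diff view shows, so the bare division is an assumption):

    // A task measured at 1,200,000 cycles that carried 256 packets adds
    // (uint64_t)((float) 1200000 / 256) = 4687 cycles per packet to each
    // batch's compute_time accumulator.
    uint64_t task_cycles = 1200000;
    uint64_t total_batch_size = 256;
    uint64_t per_packet = (uint64_t)((float) task_cycles / total_batch_size);  // 4687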
