From edfe53924da0b8edeaa5410052c43b92172e015a Mon Sep 17 00:00:00 2001
From: Joongi Kim
Date: Thu, 14 Jan 2016 00:31:40 +0900
Subject: [PATCH] refs #6: Fix some logical bugs and double the performance!

* Some operations were not being executed properly when checking the
  TASK_PREPARED status in send_offload_task_to_device().
* Still, it needs further improvement.
---
 include/nba/framework/config.hh       |  2 +-
 include/nba/framework/elementgraph.hh |  1 +
 src/engines/cuda/computecontext.cc    |  2 +-
 src/lib/computation.cc                |  2 +
 src/lib/elementgraph.cc               | 66 ++++++++++++++-------------
 src/lib/io.cc                         |  4 ++
 6 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/include/nba/framework/config.hh b/include/nba/framework/config.hh
index 0173dc7..9f9f9ed 100644
--- a/include/nba/framework/config.hh
+++ b/include/nba/framework/config.hh
@@ -57,7 +57,7 @@
 #define NBA_OQ (true)          // Use output-queuing semantics when possible.
 #undef NBA_CPU_MICROBENCH      // Enable support for PAPI library for microbenchmarks.
 
-#define NBA_REUSE_DATABLOCKS (1)
+#define NBA_REUSE_DATABLOCKS
 
 /* If you change below, update HANDLE_ALL_PORTS macro in lib/element.hh as well!! */
 #define NBA_MAX_ELEM_NEXTS (4)
diff --git a/include/nba/framework/elementgraph.hh b/include/nba/framework/elementgraph.hh
index 8aecef3..0be5a4c 100644
--- a/include/nba/framework/elementgraph.hh
+++ b/include/nba/framework/elementgraph.hh
@@ -57,6 +57,7 @@ public:
     void scan_schedulable_elements(uint64_t loop_count);
 
     /* Scans and executes dispatch() handlers of offloadable elements.
+     * It is a shorthand version that ignores next_delay output arguments.
      * This fetches the GPU-processed batches and feed them into the graph
      * again. */
     void scan_offloadable_elements(uint64_t loop_count);
diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc
index 38eb0e9..af8bfc1 100644
--- a/src/engines/cuda/computecontext.cc
+++ b/src/engines/cuda/computecontext.cc
@@ -16,7 +16,7 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de
 /* NOTE: Write-combined memory degrades performance to half... */
 {
     type_name = "cuda";
-    size_t io_base_size = 5 * 1024 * 1024; // TODO: read from config
+    size_t io_base_size = 4 * 1024 * 1024; // TODO: read from config
     cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
     io_base_ring.init(NBA_MAX_IO_BASES, node_id, io_base_ring_buf);
     for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
diff --git a/src/lib/computation.cc b/src/lib/computation.cc
index 04f0ab4..412ef3b 100644
--- a/src/lib/computation.cc
+++ b/src/lib/computation.cc
@@ -185,6 +185,7 @@ void comp_thread_context::build_element_graph(const char* config_file)
         }
     }
     RTE_LOG(INFO, ELEM, "Number of linear groups: %lu\n", linear_groups.size());
+    #ifdef NBA_REUSE_DATABLOCKS
     for (vector group : linear_groups) {
         Element *prev_el = nullptr;
@@ -254,6 +255,7 @@
         }
         // TODO: Check here once more so that the case where an offloadable element comes last is handled properly.
     }
+    #endif
     click_destroy_configuration(pi);
     fclose(input);
     elemgraph_lock->release();
diff --git a/src/lib/elementgraph.cc b/src/lib/elementgraph.cc
index 8e191e5..0cd0bb9 100644
--- a/src/lib/elementgraph.cc
+++ b/src/lib/elementgraph.cc
@@ -39,6 +39,7 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
     input_elem = nullptr;
     assert(0 == rte_malloc_validate(ctx, NULL));
+#ifdef NBA_REUSE_DATABLOCKS
     struct rte_hash_parameters hparams;
     char namebuf[RTE_HASH_NAMESIZE];
     sprintf(namebuf, "elemgraph@%u.%u:offl_actions", ctx->loc.node_id, ctx->loc.local_thread_idx);
@@ -50,6 +51,9 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
     hparams.socket_id = ctx->loc.node_id;
     offl_actions = rte_hash_create(&hparams);
     assert(offl_actions != nullptr);
+#else
+    offl_actions = nullptr;
+#endif
 }
 
 void ElementGraph::send_offload_task_to_device(OffloadTask *task)
@@ -69,45 +73,43 @@ void ElementGraph::send_offload_task_to_device(OffloadTask *task)
     /* Prepare to offload. */
     if (task->state < TASK_PREPARED) {
-        //bool had_io_base = (task->io_base != INVALID_IO_BASE);
-        bool has_io_base = false;
-        if (task->io_base == INVALID_IO_BASE) {
-            task->io_base = cctx->alloc_io_base();
-            has_io_base = (task->io_base != INVALID_IO_BASE);
+        /* On the GPU side, the datablocks argument contains only the used
+         * datablocks at the beginning of the array (not sparsely). */
+        int datablock_ids[NBA_MAX_DATABLOCKS];
+        size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
+        for (unsigned k = 0; k < num_db_used; k++) {
+            int dbid = datablock_ids[k];
+            task->datablocks.push_back(dbid);
+            task->dbid_h2d[dbid] = k;
         }
-        if (has_io_base) {
-            /* In the GPU side, datablocks argument has only used
-             * datablocks in the beginning of the array (not sparsely). */
-            int datablock_ids[NBA_MAX_DATABLOCKS];
-            size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
-            for (unsigned k = 0; k < num_db_used; k++) {
-                int dbid = datablock_ids[k];
-                task->datablocks.push_back(dbid);
-                task->dbid_h2d[dbid] = k;
-            }
-            size_t num_batches = task->batches.size();
-            /* As we reuse tasks between subsequent offloadables
-             * and only does in linear groups of elements,
-             * it is okay to check only the first batch. */
-            if (task->batches[0]->datablock_states == nullptr) {
-                void *dbstates[num_batches];
-                int bidx = 0;
-                assert(0 == rte_mempool_get_bulk(ctx->dbstate_pool, (void **) &dbstates,
-                                                 num_batches));
-                for (PacketBatch *batch : task->batches) {
-                    batch->datablock_states = (struct datablock_tracker *) dbstates[bidx];
-                    bidx ++;
-                }
+        size_t num_batches = task->batches.size();
+        /* As we reuse tasks between subsequent offloadables,
+         * and we do so only within linear groups of elements,
+         * it is okay to check only the first batch. */
+        if (task->batches[0]->datablock_states == nullptr) {
+            void *dbstates[num_batches];
+            int bidx = 0;
+            assert(0 == rte_mempool_get_bulk(ctx->dbstate_pool, (void **) &dbstates,
+                                             num_batches));
+            for (PacketBatch *batch : task->batches) {
+                batch->datablock_states = (struct datablock_tracker *) dbstates[bidx];
+                bidx ++;
            }
-            task->offload_start = 0;
+        }
+        task->offload_start = 0;
+        if (task->io_base == INVALID_IO_BASE) {
+            task->io_base = cctx->alloc_io_base();
+            bool has_io_base = (task->io_base != INVALID_IO_BASE);
+            assert(has_io_base);
             /* Calculate required buffer sizes, allocate them, and initialize them.
              * The mother buffer is statically allocated on start-up and here we
              * reserve regions inside it.
              */
-            task->prepare_read_buffer();
-            task->prepare_write_buffer();
-        } /* endif(has_io_base) */
+        } /* endif(!has_io_base) */
+
+        task->prepare_read_buffer();
+        task->prepare_write_buffer();
 
         task->state = TASK_PREPARED;
     } /* endif(!task.prepared) */
diff --git a/src/lib/io.cc b/src/lib/io.cc
index 98f6a11..54d50f0 100644
--- a/src/lib/io.cc
+++ b/src/lib/io.cc
@@ -147,6 +147,7 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
     uint64_t total_batch_size = 0;
     for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++)
         total_batch_size += task->batches[b]->count;
+    #ifdef NBA_REUSE_DATABLOCKS
     if (ctx->elem_graph->check_next_offloadable(task->elem)) {
         for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
             task->batches[b]->compute_time += (uint64_t)
@@ -160,6 +161,9 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
                      0);
         /* This task is reused. We keep them intact. */
     } else {
+    #else
+    {
+    #endif
         for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
             task->batches[b]->compute_time += (uint64_t)
                     ((float) task_cycles / total_batch_size