From edfe53924da0b8edeaa5410052c43b92172e015a Mon Sep 17 00:00:00 2001
From: Joongi Kim
Date: Thu, 14 Jan 2016 00:31:40 +0900
Subject: [PATCH] refs #6: Fix some logical bugs and double the performance!

* Some operations were not being executed properly when checking the
  TASK_PREPARED status in send_offload_task_to_device().
* Still, it needs further improvement.
---
 include/nba/framework/config.hh       |  2 +-
 include/nba/framework/elementgraph.hh |  1 +
 src/engines/cuda/computecontext.cc    |  2 +-
 src/lib/computation.cc                |  2 +
 src/lib/elementgraph.cc               | 66 ++++++++++++++-------------
 src/lib/io.cc                         |  4 ++
 6 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/include/nba/framework/config.hh b/include/nba/framework/config.hh
index 0173dc7..9f9f9ed 100644
--- a/include/nba/framework/config.hh
+++ b/include/nba/framework/config.hh
@@ -57,7 +57,7 @@
 #define NBA_OQ (true)          // Use output-queuing semantics when possible.
 #undef NBA_CPU_MICROBENCH      // Enable support for PAPI library for microbenchmarks.
 
-#define NBA_REUSE_DATABLOCKS (1)
+#define NBA_REUSE_DATABLOCKS
 
 /* If you change below, update HANDLE_ALL_PORTS macro in lib/element.hh as well!! */
 #define NBA_MAX_ELEM_NEXTS (4)
diff --git a/include/nba/framework/elementgraph.hh b/include/nba/framework/elementgraph.hh
index 8aecef3..0be5a4c 100644
--- a/include/nba/framework/elementgraph.hh
+++ b/include/nba/framework/elementgraph.hh
@@ -57,6 +57,7 @@ public:
     void scan_schedulable_elements(uint64_t loop_count);
 
     /* Scans and executes dispatch() handlers of offloadable elements.
+     * It is a shorthand version that ignores next_delay output arguments.
      * This fetches the GPU-processed batches and feed them into the graph
      * again. */
     void scan_offloadable_elements(uint64_t loop_count);
diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc
index 38eb0e9..af8bfc1 100644
--- a/src/engines/cuda/computecontext.cc
+++ b/src/engines/cuda/computecontext.cc
@@ -16,7 +16,7 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de
 /* NOTE: Write-combined memory degrades performance to half... */
 {
     type_name = "cuda";
-    size_t io_base_size = 5 * 1024 * 1024; // TODO: read from config
+    size_t io_base_size = 4 * 1024 * 1024; // TODO: read from config
     cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
     io_base_ring.init(NBA_MAX_IO_BASES, node_id, io_base_ring_buf);
     for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
diff --git a/src/lib/computation.cc b/src/lib/computation.cc
index 04f0ab4..412ef3b 100644
--- a/src/lib/computation.cc
+++ b/src/lib/computation.cc
@@ -185,6 +185,7 @@ void comp_thread_context::build_element_graph(const char* config_file)
         }
     }
     RTE_LOG(INFO, ELEM, "Number of linear groups: %lu\n", linear_groups.size());
+    #ifdef NBA_REUSE_DATABLOCKS
     for (vector group : linear_groups) {
         Element *prev_el = nullptr;
@@ -254,6 +255,7 @@
         }
         // TODO: Check here once more so that the case where an offloadable element comes last is handled properly.
     }
+    #endif
     click_destroy_configuration(pi);
     fclose(input);
     elemgraph_lock->release();
diff --git a/src/lib/elementgraph.cc b/src/lib/elementgraph.cc
index 8e191e5..0cd0bb9 100644
--- a/src/lib/elementgraph.cc
+++ b/src/lib/elementgraph.cc
@@ -39,6 +39,7 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
     input_elem = nullptr;
     assert(0 == rte_malloc_validate(ctx, NULL));
+#ifdef NBA_REUSE_DATABLOCKS
     struct rte_hash_parameters hparams;
     char namebuf[RTE_HASH_NAMESIZE];
     sprintf(namebuf, "elemgraph@%u.%u:offl_actions", ctx->loc.node_id, ctx->loc.local_thread_idx);
@@ -50,6 +51,9 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
     hparams.socket_id = ctx->loc.node_id;
     offl_actions = rte_hash_create(&hparams);
     assert(offl_actions != nullptr);
+#else
+    offl_actions = nullptr;
+#endif
 }
 
 void ElementGraph::send_offload_task_to_device(OffloadTask *task)
@@ -69,45 +73,43 @@ void ElementGraph::send_offload_task_to_device(OffloadTask *task)
     /* Prepare to offload. */
     if (task->state < TASK_PREPARED) {
-        //bool had_io_base = (task->io_base != INVALID_IO_BASE);
-        bool has_io_base = false;
-        if (task->io_base == INVALID_IO_BASE) {
-            task->io_base = cctx->alloc_io_base();
-            has_io_base = (task->io_base != INVALID_IO_BASE);
+        /* On the GPU side, the datablocks argument contains only the used
+         * datablocks at the beginning of the array (not sparsely). */
+        int datablock_ids[NBA_MAX_DATABLOCKS];
+        size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
+        for (unsigned k = 0; k < num_db_used; k++) {
+            int dbid = datablock_ids[k];
+            task->datablocks.push_back(dbid);
+            task->dbid_h2d[dbid] = k;
         }
-        if (has_io_base) {
-            /* In the GPU side, datablocks argument has only used
-             * datablocks in the beginning of the array (not sparsely). */
-            int datablock_ids[NBA_MAX_DATABLOCKS];
-            size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
-            for (unsigned k = 0; k < num_db_used; k++) {
-                int dbid = datablock_ids[k];
-                task->datablocks.push_back(dbid);
-                task->dbid_h2d[dbid] = k;
-            }
-            size_t num_batches = task->batches.size();
-            /* As we reuse tasks between subsequent offloadables
-             * and only does in linear groups of elements,
-             * it is okay to check only the first batch. */
-            if (task->batches[0]->datablock_states == nullptr) {
-                void *dbstates[num_batches];
-                int bidx = 0;
-                assert(0 == rte_mempool_get_bulk(ctx->dbstate_pool, (void **) &dbstates,
-                                                 num_batches));
-                for (PacketBatch *batch : task->batches) {
-                    batch->datablock_states = (struct datablock_tracker *) dbstates[bidx];
-                    bidx ++;
-                }
+        size_t num_batches = task->batches.size();
+        /* As we reuse tasks between subsequent offloadables,
+         * and we do so only within linear groups of elements,
+         * it is okay to check only the first batch. */
+        if (task->batches[0]->datablock_states == nullptr) {
+            void *dbstates[num_batches];
+            int bidx = 0;
+            assert(0 == rte_mempool_get_bulk(ctx->dbstate_pool, (void **) &dbstates,
+                                             num_batches));
+            for (PacketBatch *batch : task->batches) {
+                batch->datablock_states = (struct datablock_tracker *) dbstates[bidx];
+                bidx ++;
            }
-            task->offload_start = 0;
+        }
+        task->offload_start = 0;
+        if (task->io_base == INVALID_IO_BASE) {
+            task->io_base = cctx->alloc_io_base();
+            bool has_io_base = (task->io_base != INVALID_IO_BASE);
+            assert(has_io_base);
             /* Calculate required buffer sizes, allocate them, and initialize them.
              * The mother buffer is statically allocated on start-up and here we
              * reserve regions inside it.
              */
-            task->prepare_read_buffer();
-            task->prepare_write_buffer();
-        } /* endif(has_io_base) */
+        } /* endif(!has_io_base) */
+
+        task->prepare_read_buffer();
+        task->prepare_write_buffer();
 
         task->state = TASK_PREPARED;
     } /* endif(!task.prepared) */
diff --git a/src/lib/io.cc b/src/lib/io.cc
index 98f6a11..54d50f0 100644
--- a/src/lib/io.cc
+++ b/src/lib/io.cc
@@ -147,6 +147,7 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
     uint64_t total_batch_size = 0;
     for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++)
         total_batch_size += task->batches[b]->count;
+    #ifdef NBA_REUSE_DATABLOCKS
     if (ctx->elem_graph->check_next_offloadable(task->elem)) {
         for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
             task->batches[b]->compute_time += (uint64_t)
@@ -160,6 +161,9 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
                      0);
         /* This task is reused. We keep them intact. */
     } else {
+    #else
+    {
+    #endif
         for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
             task->batches[b]->compute_time += (uint64_t)
                     ((float) task_cycles / total_batch_size