From 3e6202ec6c3acfda5dcf12f06ff4b3c544b65cc4 Mon Sep 17 00:00:00 2001
From: Joongi Kim <joongi@an.kaist.ac.kr>
Date: Thu, 28 May 2015 16:39:32 +0900
Subject: [PATCH] refs #6: Fix a memory allocation bug in GPU IO buffers.

 * The allocator must report the start offset as it was "before" the newly
   allocated size is added to the current position (in lib/mempool.hh).

 * The GPU no longer crashes, but it still hangs after processing the
   first offload task.
---
 engines/cuda/compat.hh  | 12 ++++--------
 engines/cuda/mempool.hh |  4 ++--
 lib/config.hh           |  2 +-
 lib/datablock.cc        |  1 -
 lib/datablock.hh        | 12 ++++--------
 lib/elementgraph.cc     |  4 ++--
 lib/mempool.hh          |  4 ++--
 lib/offloadtask.cc      | 26 +++++++-------------------
 8 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/engines/cuda/compat.hh b/engines/cuda/compat.hh
index be608f9..5d56ab2 100644
--- a/engines/cuda/compat.hh
+++ b/engines/cuda/compat.hh
@@ -16,14 +16,10 @@ struct datablock_kernel_arg {
     void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
     uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
     uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
+    uint16_t item_size_in;
+    uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
+    uint16_t item_size_out;
+    uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
     uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
     uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
 };
diff --git a/engines/cuda/mempool.hh b/engines/cuda/mempool.hh
index 7e7af32..b86e94b 100644
--- a/engines/cuda/mempool.hh
+++ b/engines/cuda/mempool.hh
@@ -36,7 +36,7 @@ public:
         size_t offset;
         int ret = _alloc(size, &offset);
         if (ret == 0)
-            return (void *) ((uint8_t *) base_ + offset);
+            return (void *) ((uint8_t *) base_ + (uintptr_t) offset);
         return NULL;
     }
 
@@ -80,7 +80,7 @@ public:
         size_t offset;
         int ret = _alloc(size, &offset);
         if (ret == 0)
-            return (void *) ((uint8_t *) base_ + offset);
+            return (void *) ((uint8_t *) base_ + (uintptr_t) offset);
         return NULL;
     }
 
diff --git a/lib/config.hh b/lib/config.hh
index 3f35698..055c33c 100644
--- a/lib/config.hh
+++ b/lib/config.hh
@@ -38,7 +38,7 @@
 #define NBA_MAX_DATABLOCKS          (12)    // If too large (e.g., 64), batch_pool can not be allocated.
 
 #define NBA_OQ                      (true)  // Use output-queuing semantics when possible.
-#define NBA_CPU_MICROBENCH                  // Enable support for PAPI library for microbenchmarks.
+#undef NBA_CPU_MICROBENCH                  // Change to #define to enable support for PAPI library for microbenchmarks.
 
 namespace nba {
 
diff --git a/lib/datablock.cc b/lib/datablock.cc
index b92e9c3..f8cf157 100644
--- a/lib/datablock.cc
+++ b/lib/datablock.cc
@@ -151,7 +151,6 @@ tuple<size_t, size_t> DataBlock::calc_write_buffer_size(PacketBatch *batch)
     case WRITE_FIXED_SEGMENTS: {
 
         num_write_items = batch->count;
-        //write_item_sizes.size = write_roi.length;
         write_buffer_size     = write_roi.length * num_write_items;
 
         break; }
diff --git a/lib/datablock.hh b/lib/datablock.hh
index fba3a7a..c27eac2 100644
--- a/lib/datablock.hh
+++ b/lib/datablock.hh
@@ -117,14 +117,10 @@ struct datablock_kernel_arg {
     void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
     uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
     uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
+    uint16_t item_size_in;
+    uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
+    uint16_t item_size_out;
+    uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
     uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
     uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
 }; // __attribute__((aligned(8)));
diff --git a/lib/elementgraph.cc b/lib/elementgraph.cc
index e233262..49ae82a 100644
--- a/lib/elementgraph.cc
+++ b/lib/elementgraph.cc
@@ -66,10 +66,10 @@ void ElementGraph::flush_offloaded_tasks()
                 /* Prepare to offload. */
                 cctx->state = ComputeContext::PREPARING;
                 cctx->currently_running_task = task;
-                // FIXME: dedicate a single cctx to each computation thread
-                //        (COPROC_CTX_PER_COMPTHREAD 설정이 1이면 괜찮지만 아예 1로 고정할 것.)
                 task->cctx = cctx;
 
+                /* On the GPU side, the datablocks argument holds only the used
+                 * datablocks, packed at the beginning of the array (not sparsely). */
                 int datablock_ids[NBA_MAX_DATABLOCKS];
                 size_t num_db_used = task->elem->get_used_datablocks(datablock_ids);
                 for (unsigned k = 0; k < num_db_used; k++) {
diff --git a/lib/mempool.hh b/lib/mempool.hh
index 8e76458..6c0038d 100644
--- a/lib/mempool.hh
+++ b/lib/mempool.hh
@@ -23,8 +23,8 @@ public:
     {
         if (curpos_ + size > max_size_)
             return -1;
-        curpos_ = __ALIGN(curpos_, 64);
-        if (start_offset)
+        /* IMPORTANT: We need to return the position before adding the new size. */
+        if (start_offset != nullptr)
             *start_offset = curpos_;
         curpos_ += size;
         curpos_ = __ALIGN(curpos_, 64);
diff --git a/lib/offloadtask.cc b/lib/offloadtask.cc
index 6c013fd..0ef9691 100644
--- a/lib/offloadtask.cc
+++ b/lib/offloadtask.cc
@@ -121,16 +121,16 @@ bool OffloadTask::copy_h2d()
     bool has_h2d_copies = false;
 
     /* Copy the datablock information for the first kernel argument. */
-    size_t dbarray_size = ALIGN(sizeof(struct datablock_kernel_arg) * NBA_MAX_DATABLOCKS, CACHE_LINE_SIZE);
+    size_t dbarray_size = ALIGN(sizeof(struct datablock_kernel_arg) * datablocks.size(), CACHE_LINE_SIZE);
     cctx->alloc_input_buffer(dbarray_size, (void **) &dbarray_h, &dbarray_d);
     assert(dbarray_h != nullptr);
     size_t itemszarray_size = 0;
 
     for (int dbid : datablocks) {
-        int b = 0;
         int dbid_d = dbid_h2d[dbid];
         dbarray_h[dbid_d].total_item_count_in  = 0;
         dbarray_h[dbid_d].total_item_count_out = 0;
+        assert(dbid_d < datablocks.size());
 
         DataBlock *db = comp_ctx->datablock_registry[dbid];
         struct read_roi_info rri;
@@ -138,6 +138,7 @@ bool OffloadTask::copy_h2d()
         db->get_read_roi(&rri);
         db->get_write_roi(&wri);
 
+        int b = 0;
         for (PacketBatch *batch : batches) {
             struct datablock_tracker *t = &batch->datablock_states[dbid];
 
@@ -204,13 +205,11 @@ bool OffloadTask::copy_h2d()
 
     /* Coalesced H2D data copy. */
     void *first_host_in_ptr = nullptr;
-    char printbuf[4096];
-    char *pb = &printbuf[0];
     int copies = 0;
     memory_t first_dev_in_ptr;
     size_t total_size = 0;
-    pb += (intptr_t) sprintf(pb, "Host-to-device copy:\n");
     for (int dbid : datablocks) {
+        int b = 0;
         for (PacketBatch *batch : batches) {
             struct datablock_tracker *t = &batch->datablock_states[dbid];
             if (t == nullptr || t->host_in_ptr == nullptr || t->in_count == 0 || t->in_size == 0) {
@@ -218,7 +217,6 @@ bool OffloadTask::copy_h2d()
                 if (first_host_in_ptr != nullptr) {
                     /* Discontinued copy. */
                     cctx->enqueue_memwrite_op(first_host_in_ptr, first_dev_in_ptr, 0, total_size);
-                    pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_in_ptr, (char*)first_host_in_ptr + (uintptr_t)total_size);
                     /* Reset. */
                     first_host_in_ptr = nullptr;
                     total_size        = 0;
@@ -232,21 +230,19 @@ bool OffloadTask::copy_h2d()
                 first_host_in_ptr = t->host_in_ptr;
                 first_dev_in_ptr  = t->dev_in_ptr;
             }
-            total_size += t->in_size;
+            total_size += ALIGN(t->in_size, CACHE_LINE_SIZE);
             #ifndef COALESC_COPY
             cctx->enqueue_memwrite_op(t->host_in_ptr, t->dev_in_ptr, 0, t->in_size);
             #endif
-            //printf("%p - %p\n", t->host_in_ptr, (void *)((char*) t->host_in_ptr + t->in_size));
             has_h2d_copies = true;
+            b++;
         }
     }
     #ifdef COALESC_COPY
     if (first_host_in_ptr != nullptr) {
         /* Finished copy. */
         cctx->enqueue_memwrite_op(first_host_in_ptr, first_dev_in_ptr, 0, total_size);
-        pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_in_ptr, (char*)first_host_in_ptr + (uintptr_t)total_size);
     }
-    //printf("%s", printbuf);
     #endif
     return has_h2d_copies;
 }
@@ -298,8 +294,6 @@ void OffloadTask::execute()
             }
             batch_id ++;
         }
-        //cctx->enqueue_memwrite_op(batch_ids_h, batch_ids_d, 0, sizeof(uint16_t) * all_item_count);
-        //cctx->enqueue_memwrite_op(item_ids_h, item_ids_d, 0, sizeof(uint16_t) * all_item_count);
         cctx->enqueue_memwrite_op(batch_ids_h, batch_ids_d, 0, ALIGN(sizeof(uint16_t) * all_item_count, CACHE_LINE_SIZE) * 2);
 
         cctx->clear_checkbits();
@@ -340,12 +334,9 @@ bool OffloadTask::copy_d2h()
     /* Coalesced D2H data copy. */
     bool has_d2h_copies = false;
     void *first_host_out_ptr = nullptr;
-    char printbuf[4096];
-    char *pb = &printbuf[0];
     int copies = 0;
     memory_t first_dev_out_ptr;
     size_t total_size = 0;
-    pb += (intptr_t) sprintf(pb, "Device-to-host copy:\n");
     for (int dbid : datablocks) {
         DataBlock *db = comp_ctx->datablock_registry[dbid];
         for (PacketBatch *batch : batches) {
@@ -355,7 +346,6 @@ bool OffloadTask::copy_d2h()
                 if (first_host_out_ptr != nullptr) {
                     /* Discontinued copy. */
                     cctx->enqueue_memread_op(first_host_out_ptr, first_dev_out_ptr, 0, total_size);
-                    pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_out_ptr, (char*)first_host_out_ptr + (uintptr_t)total_size);
                     /* Reset. */
                     first_host_out_ptr = nullptr;
                     total_size         = 0;
@@ -369,7 +359,7 @@ bool OffloadTask::copy_d2h()
                 first_host_out_ptr = t->host_out_ptr;
                 first_dev_out_ptr  = t->dev_out_ptr;
             }
-            total_size += t->out_size;
+            total_size += ALIGN(t->out_size, CACHE_LINE_SIZE);
             #ifndef COALESC_COPY
             cctx->enqueue_memread_op(t->host_out_ptr, t->dev_out_ptr, 0, t->out_size);
             #endif
@@ -380,9 +370,7 @@ bool OffloadTask::copy_d2h()
     if (first_host_out_ptr != nullptr) {
         /* Finished copy. */
         cctx->enqueue_memread_op(first_host_out_ptr, first_dev_out_ptr, 0, total_size);
-        pb += (intptr_t) sprintf(pb, " [%d] %p - %p\n", copies++, first_host_out_ptr, (char*)first_host_out_ptr + (uintptr_t)total_size);
     }
-    //printf("%s", printbuf);
     #endif
     return has_d2h_copies;
 }