From d81c5f936b27440ede78cf2cd5c9acce8f45c820 Mon Sep 17 00:00:00 2001
From: Joongi Kim <joongi@an.kaist.ac.kr>
Date: Thu, 14 Jan 2016 21:36:17 +0900
Subject: [PATCH] refs #6: Try physically contiguous memory as GPU IO buffer,
 but failed.

 * Uses DPDK's memzone instead of CUDA's portable buffers.

 * Just keep it as an optional feature.
   Default is not to use it.
---
 include/nba/engines/cuda/computecontext.hh |  5 +++
 include/nba/engines/cuda/mempool.hh        | 16 +++++---
 include/nba/framework/computedevice.hh     |  2 +-
 src/engines/cuda/computecontext.cc         | 47 ++++++++++++++++++----
 src/lib/io.cc                              | 16 ++++----
 5 files changed, 64 insertions(+), 22 deletions(-)
diff --git a/include/nba/engines/cuda/computecontext.hh b/include/nba/engines/cuda/computecontext.hh
index 22a2aa6..5eeab56 100644
--- a/include/nba/engines/cuda/computecontext.hh
+++ b/include/nba/engines/cuda/computecontext.hh
@@ -12,6 +12,8 @@
 #include <nba/engines/cuda/mempool.hh>
 #include <nba/engines/cuda/utils.hh>
 
+struct rte_memzone;
+
 #define CUDA_MAX_KERNEL_ARGS    (16)
 
 namespace nba
@@ -92,6 +94,9 @@ private:
     CPUMemoryPool _cpu_mempool_in[NBA_MAX_IO_BASES];
     CPUMemoryPool _cpu_mempool_out[NBA_MAX_IO_BASES];
 
+    const struct rte_memzone *reserve_memory(ComputeDevice *mother);
+    const struct rte_memzone *mz;
+
     void *dummy_host_buf;
     memory_t dummy_dev_buf;
 
diff --git a/include/nba/engines/cuda/mempool.hh b/include/nba/engines/cuda/mempool.hh
index a62406d..aa8d4ac 100644
--- a/include/nba/engines/cuda/mempool.hh
+++ b/include/nba/engines/cuda/mempool.hh
@@ -55,7 +55,7 @@ private:
 class CPUMemoryPool : public MemoryPool
 {
 public:
-    CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags)
+    CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags), use_external(false)
     {
     }
 
@@ -72,11 +72,16 @@ public:
         return true;
     }
 
-    bool init_with_flags(unsigned long size, int flags)
+    bool init_with_flags(unsigned long size, void *ext_ptr, int flags)
     {
         this->max_size = size;
-        cutilSafeCall(cudaHostAlloc((void **) &base, size,
-                      flags));
+        if (ext_ptr != nullptr) {
+            base = ext_ptr;
+            use_external = true;
+        } else {
+            cutilSafeCall(cudaHostAlloc((void **) &base, size,
+                          flags));
+        }
         return true;
     }
 
@@ -91,7 +96,7 @@ public:
 
     void destroy()
     {
-        if (base != NULL)
+        if (base != NULL && !use_external)
             cudaFreeHost(base);
     }
 
@@ -103,6 +108,7 @@ public:
 protected:
     void *base;
     int flags;
+    bool use_external;
 };
 
 }
diff --git a/include/nba/framework/computedevice.hh b/include/nba/framework/computedevice.hh
index 8bcff86..b174704 100644
--- a/include/nba/framework/computedevice.hh
+++ b/include/nba/framework/computedevice.hh
@@ -104,10 +104,10 @@ public:
     struct ev_async *input_watcher;
     AsyncSemaphore available_sema;
 
-protected:
     const unsigned node_id;
     const unsigned device_id;
     const size_t num_contexts;
+protected:
     std::vector<ComputeContext *> contexts;
     Lock _lock;
 
diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc
index af8bfc1..75fe6d1 100644
--- a/src/engines/cuda/computecontext.cc
+++ b/src/engines/cuda/computecontext.cc
@@ -1,5 +1,7 @@
 #include <nba/core/intrinsic.hh>
 #include <nba/engines/cuda/computecontext.hh>
+#include <rte_memzone.h>
+#include <unistd.h>
 
 using namespace std;
 using namespace nba;
@@ -10,27 +12,38 @@ struct cuda_event_context {
     void *user_arg;
 };
 
-CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_device)
- : ComputeContext(ctx_id, mother_device), checkbits_d(NULL), checkbits_h(NULL),
-   num_kernel_args(0)
+#define IO_BASE_SIZE (4 * 1024 * 1024)
+#undef USE_PHYS_CONT_MEMORY // performance degraded :(
+
+CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother)
+ : ComputeContext(ctx_id, mother), checkbits_d(NULL), checkbits_h(NULL),
+   mz(reserve_memory(mother)), num_kernel_args(0)
    /* NOTE: Write-combined memory degrades performance to half... */
 {
     type_name = "cuda";
-    size_t io_base_size = 4 * 1024 * 1024; // TODO: read from config
+    size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); // TODO: read from config
     cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
     io_base_ring.init(NBA_MAX_IO_BASES, node_id, io_base_ring_buf);
     for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
         io_base_ring.push_back(i);
         _cuda_mempool_in[i].init(io_base_size);
         _cuda_mempool_out[i].init(io_base_size);
-        _cpu_mempool_in[i].init_with_flags(io_base_size, cudaHostAllocPortable);
-        _cpu_mempool_out[i].init_with_flags(io_base_size, cudaHostAllocPortable);
+        #ifdef USE_PHYS_CONT_MEMORY
+        void *base;
+        base = (void *) ((uintptr_t) mz->addr + i * io_base_size);
+        _cpu_mempool_in[i].init_with_flags(io_base_size, base, 0);
+        base = (void *) ((uintptr_t) mz->addr + i * io_base_size + NBA_MAX_IO_BASES * io_base_size);
+        _cpu_mempool_out[i].init_with_flags(io_base_size, base, 0);
+        #else
+        _cpu_mempool_in[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable);
+        _cpu_mempool_out[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable);
+        #endif
     }
     {
         void *t;
-        cutilSafeCall(cudaMalloc((void **) &t, 64));
+        cutilSafeCall(cudaMalloc((void **) &t, CACHE_LINE_SIZE));
         dummy_dev_buf.ptr = t;
-        cutilSafeCall(cudaHostAlloc((void **) &t, 64, cudaHostAllocPortable));
+        cutilSafeCall(cudaHostAlloc((void **) &t, CACHE_LINE_SIZE, cudaHostAllocPortable));
         dummy_host_buf = t;
     }
     cutilSafeCall(cudaHostAlloc((void **) &checkbits_h, MAX_BLOCKS, cudaHostAllocMapped));
@@ -40,6 +53,22 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de
     memset(checkbits_h, 0, MAX_BLOCKS);
 }
 
+const struct rte_memzone *CUDAComputeContext::reserve_memory(ComputeDevice *mother)
+{
+#ifdef USE_PHYS_CONT_MEMORY
+    char namebuf[RTE_MEMZONE_NAMESIZE];
+    size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize());
+    snprintf(namebuf, RTE_MEMZONE_NAMESIZE, "cuda.io.%d:%d", mother->device_id, ctx_id);
+    const struct rte_memzone *_mz = rte_memzone_reserve(namebuf, 2 * io_base_size * NBA_MAX_IO_BASES,
+                                                        mother->node_id,
+                                                        RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY);
+    assert(_mz != nullptr);
+    return _mz;
+#else
+    return nullptr;
+#endif
+}
+
 CUDAComputeContext::~CUDAComputeContext()
 {
     cutilSafeCall(cudaStreamDestroy(_stream));
@@ -49,6 +78,8 @@ CUDAComputeContext::~CUDAComputeContext()
         _cpu_mempool_in[i].destroy();
         _cpu_mempool_out[i].destroy();
     }
+    if (mz != nullptr)
+        rte_memzone_free(mz);
     cutilSafeCall(cudaFreeHost(checkbits_h));
 }
 
diff --git a/src/lib/io.cc b/src/lib/io.cc
index 53f5fa9..233138c 100644
--- a/src/lib/io.cc
+++ b/src/lib/io.cc
@@ -813,9 +813,9 @@ int io_loop(void *arg)
     /* The IO thread runs in polling mode. */
     while (likely(!ctx->loop_broken)) {
         unsigned total_recv_cnt = 0;
-        #ifdef NBA_CPU_MICROBENCH
+        #ifdef NBA_CPU_MICROBENCH/*{{{*/
         PAPI_start(ctx->papi_evset_rx);
-        #endif
+        #endif/*}}}*/
         for (i = 0; i < ctx->num_hw_rx_queues; i++) {
 #ifdef NBA_RANDOM_PORT_ACCESS /*{{{*/
             /* Shuffle the RX queue list. */
@@ -939,14 +939,14 @@ int io_loop(void *arg)
 
         } // end of rxq scanning
         assert(total_recv_cnt <= NBA_MAX_IO_BATCH_SIZE * ctx->num_hw_rx_queues);
-        #ifdef NBA_CPU_MICROBENCH
+        #ifdef NBA_CPU_MICROBENCH/*{{{*/
         {
             long long ctr[5];
             PAPI_stop(ctx->papi_evset_rx, ctr);
             for (int i = 0; i < 5; i++)
                 ctx->papi_ctr_rx[i] += ctr[i];
         }
-        #endif
+        #endif/*}}}*/
 
         if (ctx->mode == IO_EMUL) {/*{{{*/
             while (!rte_ring_empty(ctx->drop_queue)) {
@@ -970,14 +970,14 @@ int io_loop(void *arg)
         /* Scan and execute schedulable elements. */
         ctx->comp_ctx->elem_graph->scan_schedulable_elements(loop_count);
 
-        #ifdef NBA_CPU_MICROBENCH
+        #ifdef NBA_CPU_MICROBENCH/*{{{*/
         {
             long long ctr[5];
             PAPI_stop(ctx->papi_evset_comp, ctr);
             for (int i = 0; i < 5; i++)
                 ctx->papi_ctr_comp[i] += ctr[i];
         }
-        #endif
+        #endif/*}}}*/
 
         while (!rte_ring_empty(ctx->new_packet_request_ring))/*{{{*/
         {
@@ -1013,9 +1013,9 @@ int io_loop(void *arg)
 
         /* Process received packets. */
         print_ratelimit("# received pkts from all rxq", total_recv_cnt, 10000);
-        #ifdef NBA_CPU_MICROBENCH
+        #ifdef NBA_CPU_MICROBENCH/*{{{*/
         PAPI_start(ctx->papi_evset_comp);
-        #endif
+        #endif/*}}}*/
         unsigned comp_batch_size = ctx->comp_ctx->num_combatch_size;
         for (unsigned pidx = 0; pidx < total_recv_cnt; pidx += comp_batch_size) {
             comp_process_batch(ctx, &pkts[pidx], RTE_MIN(comp_batch_size, total_recv_cnt - pidx), loop_count);