From d81c5f936b27440ede78cf2cd5c9acce8f45c820 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Thu, 14 Jan 2016 21:36:17 +0900 Subject: [PATCH] refs #6: Try physically contiguous memory as GPU IO buffer, but failed. * Uses DPDK's memzone instead of CUDA's portable buffers. * Just keep it as an optional feature. Default is not to use it. --- include/nba/engines/cuda/computecontext.hh | 5 +++ include/nba/engines/cuda/mempool.hh | 16 +++++--- include/nba/framework/computedevice.hh | 2 +- src/engines/cuda/computecontext.cc | 47 ++++++++++++++++++---- src/lib/io.cc | 16 ++++---- 5 files changed, 64 insertions(+), 22 deletions(-) diff --git a/include/nba/engines/cuda/computecontext.hh b/include/nba/engines/cuda/computecontext.hh index 22a2aa6..5eeab56 100644 --- a/include/nba/engines/cuda/computecontext.hh +++ b/include/nba/engines/cuda/computecontext.hh @@ -12,6 +12,8 @@ #include #include +struct rte_memzone; + #define CUDA_MAX_KERNEL_ARGS (16) namespace nba @@ -92,6 +94,9 @@ private: CPUMemoryPool _cpu_mempool_in[NBA_MAX_IO_BASES]; CPUMemoryPool _cpu_mempool_out[NBA_MAX_IO_BASES]; + const struct rte_memzone *reserve_memory(ComputeDevice *mother); + const struct rte_memzone *mz; + void *dummy_host_buf; memory_t dummy_dev_buf; diff --git a/include/nba/engines/cuda/mempool.hh b/include/nba/engines/cuda/mempool.hh index a62406d..aa8d4ac 100644 --- a/include/nba/engines/cuda/mempool.hh +++ b/include/nba/engines/cuda/mempool.hh @@ -55,7 +55,7 @@ private: class CPUMemoryPool : public MemoryPool { public: - CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags) + CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags), use_external(false) { } @@ -72,11 +72,16 @@ public: return true; } - bool init_with_flags(unsigned long size, int flags) + bool init_with_flags(unsigned long size, void *ext_ptr, int flags) { this->max_size = size; - cutilSafeCall(cudaHostAlloc((void **) &base, size, - flags)); + if (ext_ptr != nullptr) { + base = ext_ptr; + use_external = true; + } else { + cutilSafeCall(cudaHostAlloc((void **) &base, size, + flags)); + } return true; } @@ -91,7 +96,7 @@ public: void destroy() { - if (base != NULL) + if (base != NULL && !use_external) cudaFreeHost(base); } @@ -103,6 +108,7 @@ public: protected: void *base; int flags; + bool use_external; }; } diff --git a/include/nba/framework/computedevice.hh b/include/nba/framework/computedevice.hh index 8bcff86..b174704 100644 --- a/include/nba/framework/computedevice.hh +++ b/include/nba/framework/computedevice.hh @@ -104,10 +104,10 @@ public: struct ev_async *input_watcher; AsyncSemaphore available_sema; -protected: const unsigned node_id; const unsigned device_id; const size_t num_contexts; +protected: std::vector contexts; Lock _lock; diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc index af8bfc1..75fe6d1 100644 --- a/src/engines/cuda/computecontext.cc +++ b/src/engines/cuda/computecontext.cc @@ -1,5 +1,7 @@ #include #include +#include +#include using namespace std; using namespace nba; @@ -10,27 +12,38 @@ struct cuda_event_context { void *user_arg; }; -CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_device) - : ComputeContext(ctx_id, mother_device), checkbits_d(NULL), checkbits_h(NULL), - num_kernel_args(0) +#define IO_BASE_SIZE (4 * 1024 * 1024) +#undef USE_PHYS_CONT_MEMORY // performance degraded :( + +CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother) + : ComputeContext(ctx_id, mother), checkbits_d(NULL), checkbits_h(NULL), + mz(reserve_memory(mother)), num_kernel_args(0) /* NOTE: Write-combined memory degrades performance to half... */ { type_name = "cuda"; - size_t io_base_size = 4 * 1024 * 1024; // TODO: read from config + size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); // TODO: read from config cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking)); io_base_ring.init(NBA_MAX_IO_BASES, node_id, io_base_ring_buf); for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) { io_base_ring.push_back(i); _cuda_mempool_in[i].init(io_base_size); _cuda_mempool_out[i].init(io_base_size); - _cpu_mempool_in[i].init_with_flags(io_base_size, cudaHostAllocPortable); - _cpu_mempool_out[i].init_with_flags(io_base_size, cudaHostAllocPortable); + #ifdef USE_PHYS_CONT_MEMORY + void *base; + base = (void *) ((uintptr_t) mz->addr + i * io_base_size); + _cpu_mempool_in[i].init_with_flags(io_base_size, base, 0); + base = (void *) ((uintptr_t) mz->addr + i * io_base_size + NBA_MAX_IO_BASES * io_base_size); + _cpu_mempool_out[i].init_with_flags(io_base_size, base, 0); + #else + _cpu_mempool_in[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable); + _cpu_mempool_out[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable); + #endif } { void *t; - cutilSafeCall(cudaMalloc((void **) &t, 64)); + cutilSafeCall(cudaMalloc((void **) &t, CACHE_LINE_SIZE)); dummy_dev_buf.ptr = t; - cutilSafeCall(cudaHostAlloc((void **) &t, 64, cudaHostAllocPortable)); + cutilSafeCall(cudaHostAlloc((void **) &t, CACHE_LINE_SIZE, cudaHostAllocPortable)); dummy_host_buf = t; } cutilSafeCall(cudaHostAlloc((void **) &checkbits_h, MAX_BLOCKS, cudaHostAllocMapped)); @@ -40,6 +53,22 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de memset(checkbits_h, 0, MAX_BLOCKS); } +const struct rte_memzone *CUDAComputeContext::reserve_memory(ComputeDevice *mother) +{ +#ifdef USE_PHYS_CONT_MEMORY + char namebuf[RTE_MEMZONE_NAMESIZE]; + size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); + snprintf(namebuf, RTE_MEMZONE_NAMESIZE, "cuda.io.%d:%d", mother->device_id, ctx_id); + const struct rte_memzone *_mz = rte_memzone_reserve(namebuf, 2 * io_base_size * NBA_MAX_IO_BASES, + mother->node_id, + RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY); + assert(_mz != nullptr); + return _mz; +#else + return nullptr; +#endif +} + CUDAComputeContext::~CUDAComputeContext() { cutilSafeCall(cudaStreamDestroy(_stream)); @@ -49,6 +78,8 @@ CUDAComputeContext::~CUDAComputeContext() _cpu_mempool_in[i].destroy(); _cpu_mempool_out[i].destroy(); } + if (mz != nullptr) + rte_memzone_free(mz); cutilSafeCall(cudaFreeHost(checkbits_h)); } diff --git a/src/lib/io.cc b/src/lib/io.cc index 53f5fa9..233138c 100644 --- a/src/lib/io.cc +++ b/src/lib/io.cc @@ -813,9 +813,9 @@ int io_loop(void *arg) /* The IO thread runs in polling mode. */ while (likely(!ctx->loop_broken)) { unsigned total_recv_cnt = 0; - #ifdef NBA_CPU_MICROBENCH + #ifdef NBA_CPU_MICROBENCH/*{{{*/ PAPI_start(ctx->papi_evset_rx); - #endif + #endif/*}}}*/ for (i = 0; i < ctx->num_hw_rx_queues; i++) { #ifdef NBA_RANDOM_PORT_ACCESS /*{{{*/ /* Shuffle the RX queue list. */ @@ -939,14 +939,14 @@ int io_loop(void *arg) } // end of rxq scanning assert(total_recv_cnt <= NBA_MAX_IO_BATCH_SIZE * ctx->num_hw_rx_queues); - #ifdef NBA_CPU_MICROBENCH + #ifdef NBA_CPU_MICROBENCH/*{{{*/ { long long ctr[5]; PAPI_stop(ctx->papi_evset_rx, ctr); for (int i = 0; i < 5; i++) ctx->papi_ctr_rx[i] += ctr[i]; } - #endif + #endif/*}}}*/ if (ctx->mode == IO_EMUL) {/*{{{*/ while (!rte_ring_empty(ctx->drop_queue)) { @@ -970,14 +970,14 @@ int io_loop(void *arg) /* Scan and execute schedulable elements. */ ctx->comp_ctx->elem_graph->scan_schedulable_elements(loop_count); - #ifdef NBA_CPU_MICROBENCH + #ifdef NBA_CPU_MICROBENCH/*{{{*/ { long long ctr[5]; PAPI_stop(ctx->papi_evset_comp, ctr); for (int i = 0; i < 5; i++) ctx->papi_ctr_comp[i] += ctr[i]; } - #endif + #endif/*}}}*/ while (!rte_ring_empty(ctx->new_packet_request_ring))/*{{{*/ { @@ -1013,9 +1013,9 @@ int io_loop(void *arg) /* Process received packets. */ print_ratelimit("# received pkts from all rxq", total_recv_cnt, 10000); - #ifdef NBA_CPU_MICROBENCH + #ifdef NBA_CPU_MICROBENCH/*{{{*/ PAPI_start(ctx->papi_evset_comp); - #endif + #endif/*}}}*/ unsigned comp_batch_size = ctx->comp_ctx->num_combatch_size; for (unsigned pidx = 0; pidx < total_recv_cnt; pidx += comp_batch_size) { comp_process_batch(ctx, &pkts[pidx], RTE_MIN(comp_batch_size, total_recv_cnt - pidx), loop_count);