refs #6: Work-in-progress - backporting our internal datacopy-optimization branch.

 * Backported changes

   - Change NBA_MAX_PROCESSOR_TYPES to NBA_MAX_COPROCESSOR_TYPES
     to count coprocessors from zero instead of one.

   - Add an xmem argument to FixedRing to allow using an externally
     allocated memory area for the ring.

   - Add graph analysis for datablock reuse.
     It determines when to preproc/postproc datablocks during element
     graph initialization.  (Not working yet...)

     . Also add check_preproc(), check_postproc(), check_postproc_all()
       methods to ElementGraph for later use.

   - Refactor and optimize scanning of schedulable elements.

   - Refactor OffloadableElement to make it schedulable for
     consistency.  This moves the task preparation code from
     ElementGraph into OffloadableElement.

   - Remove support for the "sleepy" IO loop.

 * Excluded changes

   - Change the IO loop to not consume all received packets,
     but instead call comp_process_batch() only once per iteration,
     using the number of packets exceeding the computation batch size
     to reduce IO polling overhead.

     => Rejected since it actually reduces performance by about 10%
        with CPU-only configurations.

 * New changes

   - Move invocations of the elemgraph->flush_* methods into the ev_check
     event handler for brevity and to reduce the chance of mistakes
     (see the sketch below).

 * Performance impacts

   - There is no degradation of CPU-only or GPU-only performance
     compared to the previous commit.
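As a rough illustration of the ev_check placement mentioned above (not part of this
diff; the loop/userdata wiring and the ctx->elem_graph member name are assumptions,
and only flush_delayed_batches() is visible in the headers below):

    #include <ev.h>
    // nba/framework headers are assumed for comp_thread_context and ElementGraph.

    static void comp_check_cb(struct ev_loop *loop, struct ev_check *watcher, int revents)
    {
        // Invoked once per ev_run() iteration.
        auto *ctx = (nba::comp_thread_context *) ev_userdata(loop);
        ctx->elem_graph->flush_delayed_batches();   // declared in elementgraph.hh below
    }

    // Registration during computation-thread startup (illustrative):
    // struct ev_check check_watcher;
    // ev_check_init(&check_watcher, comp_check_cb);
    // ev_check_start(loop, &check_watcher);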
achimnol committed Sep 13, 2015
1 parent ee69095 commit 05a270a
Showing 24 changed files with 571 additions and 397 deletions.
2 changes: 1 addition & 1 deletion elements/loadbalancers/CPUOnly.hh
@@ -33,7 +33,7 @@ public:

int process_batch(int input_port, PacketBatch *batch)
{
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, 0);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, -1);
return 0;
}

2 changes: 1 addition & 1 deletion elements/loadbalancers/GPUOnly.hh
@@ -33,7 +33,7 @@ public:

int process_batch(int input_port, PacketBatch *batch)
{
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, 1);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, 0);
return 0;
}

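The -1/0 values here and in the other load balancers below follow from the new
zero-based coprocessor numbering: a decision of -1 keeps the batch on the CPU,
while 0..NBA_MAX_COPROCESSORS-1 selects a coprocessor. A self-contained sketch of
the convention (decode_lb_decision() is illustrative, not part of NBA):

    #include <cassert>

    enum class Target { CPU, Coprocessor };

    // Illustrative decoder for the NBA_BANNO_LB_DECISION annotation after this
    // commit: -1 -> CPU (was 0), 0.. -> zero-based coprocessor index (was 1..).
    static Target decode_lb_decision(int decision, int &coproc_idx)
    {
        if (decision < 0)
            return Target::CPU;
        coproc_idx = decision;
        return Target::Coprocessor;
    }

    int main()
    {
        int idx = -1;
        assert(decode_lb_decision(-1, idx) == Target::CPU);
        assert(decode_lb_decision(0, idx) == Target::Coprocessor && idx == 0);
        return 0;
    }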
2 changes: 1 addition & 1 deletion elements/loadbalancers/LoadBalanceAIMD.hh
@@ -65,7 +65,7 @@ public:
}
choice = idx;
assert(choice >= 0);
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice - 1);
return 0;
}

2 changes: 1 addition & 1 deletion elements/loadbalancers/LoadBalanceAdaptiveGlobal.hh
@@ -52,7 +52,7 @@ public:
{
/* Generate a random number and find the interval where it belongs to. */
int64_t x = uniform_dist(random_generator);
- int _temp = (x > local_cpu_ratio);
+ int _temp = (x > local_cpu_ratio) - 1;
anno_set(&batch->banno, NBA_BANNO_LB_DECISION, _temp);
return 0;
}
6 changes: 3 additions & 3 deletions elements/loadbalancers/LoadBalanceAdaptiveMeasure.hh
@@ -77,10 +77,10 @@ public:
// }}}
#endif
// {{{ Deterministic load balancer
- int decision = 0;
+ int decision = -1;
rep ++;
if (offload) { // GPU-mode
- decision = 1;
+ decision = 0;
if (rep >= rep_limit_gpu) { // Change to CPU-mode
if (local_cpu_ratio == 0)
rep_limit = 0; // only once for sampling!
@@ -90,7 +90,7 @@ public:
offload = false;
}
} else { // CPU-mode
- decision = 0;
+ decision = -1;
if (rep >= rep_limit_cpu) { // Change to GPU-mode
rep_limit = rep_limit_gpu;
rep = 0;
4 changes: 2 additions & 2 deletions elements/loadbalancers/LoadBalanceByEnv.hh
@@ -34,9 +34,9 @@ public:
lb_mode = const_cast<char*>("CPUOnly");

if (!strcmp(lb_mode, "CPUOnly")) {
- lb_decision = 0;
+ lb_decision = -1;
} else if (!strcmp(lb_mode, "GPUOnly")) {
- lb_decision = 1;
+ lb_decision = 0;
} else {
rte_panic("Unsupported load balancer mode: %s\n", lb_mode);
}
4 changes: 2 additions & 2 deletions elements/loadbalancers/LoadBalanceByWeight.hh
@@ -24,7 +24,7 @@ public:
const char *port_count() const { return "1/1"; }
int get_type() const { return SchedulableElement::get_type() | PerBatchElement::get_type(); }

- int initialize() {
+ int initialize() {
/* Initialize random engines. */
out_probs = std::vector<float>();
uniform_dist = std::uniform_real_distribution<float>(0.0f, 1.0f);
@@ -85,7 +85,7 @@ public:
}
choice = idx;
assert(choice >= 0);
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice - 1);
return 0;
}

6 changes: 3 additions & 3 deletions elements/loadbalancers/LoadBalancePPC.hh
@@ -64,11 +64,11 @@ public:

int process_batch(int input_port, PacketBatch *batch)
{
- int decision = 0;
+ int decision = -1;
const float c = (float) local_cpu_ratio / LB_PPC_CPU_RATIO_MULTIPLIER;
rep ++;
if (offload) {
- decision = 1;
+ decision = 0;
if (rep == rep_limit) { // Change to CPU-mode
if (local_cpu_ratio == 0)
rep_limit = 0; // only once for sampling!
@@ -78,7 +78,7 @@ public:
offload = false;
}
} else {
- decision = 0;
+ decision = -1;
if (rep == rep_limit) { // Change to GPU-mode
rep_limit = ctx->num_coproc_ppdepth;
rep = 0;
2 changes: 1 addition & 1 deletion elements/loadbalancers/LoadBalanceThruput.hh
@@ -60,7 +60,7 @@ public:
{
/* Generate a random number and find the interval where it belongs to. */
int64_t x = uniform_dist(random_generator);
- int choice = (x > local_cpu_ratio);
+ int choice = (x > local_cpu_ratio) - 1;
anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice);
return 0;
}
20 changes: 16 additions & 4 deletions include/nba/core/queue.hh
@@ -157,10 +157,10 @@ public:
/* Default constructor. You must explicitly call init() to use the instance. */
}

- FixedRing(size_t max_size, int numa_node = 0)
+ FixedRing(size_t max_size, int numa_node = 0, T *xmem = nullptr)
: v_(nullptr), push_idx(0), pop_idx(0), count(0), max_size(max_size)
{
- init(max_size, numa_node);
+ init(max_size, numa_node, xmem);
}

virtual ~FixedRing()
@@ -169,12 +169,16 @@ public:
rte_free(v_);
}

- void init(size_t max_size, int numa_node = 0)
+ void init(size_t max_size, int numa_node = 0, T *xmem = nullptr)
{
assert(max_size > 0);
this->count = 0;
this->max_size = max_size;
- v_ = (T*) rte_malloc_socket("fixedring", sizeof(T) * max_size, 64, numa_node);
+ if (xmem == nullptr) {
+     v_ = (T*) rte_malloc_socket("fixedring", sizeof(T) * max_size, 64, numa_node);
+ } else {
+     v_ = xmem;
+ }
assert(v_ != nullptr);
}

@@ -186,6 +190,14 @@ public:
count ++;
}

+ void push_front(T t)
+ {
+     assert(count < max_size);
+     v_[pop_idx - 1] = t;
+     pop_idx = (pop_idx - 1) % max_size;
+     count ++;
+ }

T front() const
{
if (!empty())
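With the new xmem parameter, a FixedRing can sit on top of a caller-provided buffer
instead of memory from rte_malloc_socket(); element.hh below uses exactly this for
finished_batches. A minimal usage sketch (QLEN and setup_finished_ring() are
made-up names; the NBA headers from this commit are assumed):

    static const size_t QLEN = 256;
    static PacketBatch *ring_storage[QLEN];           // externally allocated backing store
    static FixedRing<PacketBatch *, nullptr> ring;    // default-constructed, init() called later

    static void setup_finished_ring()
    {
        // With xmem != nullptr, init() uses the given buffer and skips the
        // rte_malloc_socket() call; the numa_node argument is then irrelevant.
        ring.init(QLEN, -1, ring_storage);
    }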
28 changes: 23 additions & 5 deletions include/nba/element/element.hh
@@ -19,6 +19,7 @@ namespace nba {

/* Forward declarations. */
class Element;
+ class ElementGraph;
class OffloadTask;
class comp_thread_context;
class ComputeContext;
@@ -58,6 +59,7 @@

private:
friend class Element;
+ friend class GraphMetaData;

int my_idx;
Element *elem;
@@ -186,28 +188,42 @@ public:
uint64_t _last_check_tick;
};

- class OffloadableElement : virtual public Element {
+ class OffloadableElement : virtual public SchedulableElement {

+ friend class ElementGraph;

+ private:
+ static const size_t MAX_FINBATCH_QLEN = NBA_MAX_COPROC_PPDEPTH * 16;

public:
+ int reuse_head_ref = 0;
+ int reuse_tail_ref = 0;

- OffloadableElement() : Element()
+ OffloadableElement() : SchedulableElement()
{
if (dummy_device) {
auto ch = [this] (ComputeContext *ctx, struct resource_param *res) {
this->dummy_compute_handler(ctx, res);
};
offload_compute_handlers.insert({{"dummy", ch},});
}
- for (int i = 0; i < NBA_MAX_PROCESSOR_TYPES; i++)
+ for (int i = 0; i < NBA_MAX_COPROCESSOR_TYPES; i++)
tasks[i] = nullptr;
+ finished_batches.init(MAX_FINBATCH_QLEN, -1, finished_batches_arrbuf);
}
virtual ~OffloadableElement() {}
- int get_type() const { return ELEMTYPE_OFFLOADABLE; }
+ int get_type() const { return ELEMTYPE_OFFLOADABLE | ELEMTYPE_SCHEDULABLE; }

+ /** Begins offloading of the given batch. */
+ int offload(ElementGraph *mother, PacketBatch *in_batch, int input_port);
+
+ /** Stores the batches that are returned from offloading. */
+ int enqueue_batch(PacketBatch *batch);
+
+ /** Resumes the element graph processing using the enqueued batches. */
+ int dispatch(uint64_t loop_count, PacketBatch*& out_batch, uint64_t &next_delay);

/** Returns the list of supported devices for offloading. */
virtual void get_supported_devices(std::vector<std::string> &device_names) const = 0;
//virtual size_t get_desired_workgroup_size(const char *device_name) const = 0;
virtual int get_offload_item_counter_dbid() const = 0;
@@ -222,7 +238,9 @@ public:
std::unordered_map<std::string, offload_init_handler> offload_init_handlers;

private:
- OffloadTask *tasks[NBA_MAX_PROCESSOR_TYPES];
+ OffloadTask *tasks[NBA_MAX_COPROCESSOR_TYPES];
+ FixedRing<PacketBatch*, nullptr> finished_batches;
+ PacketBatch *finished_batches_arrbuf[MAX_FINBATCH_QLEN];
void dummy_compute_handler(ComputeContext *ctx, struct resource_param *res);
};

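enqueue_batch() and dispatch() are only declared here; their bodies are not part of
this excerpt. A rough sketch of the intended flow (the bodies below are assumptions,
and push_back()/pop_front() are assumed FixedRing counterparts of the push_front()/
front() shown above):

    // Illustrative only -- not the committed implementation.
    int nba::OffloadableElement::enqueue_batch(PacketBatch *batch)
    {
        finished_batches.push_back(batch);   // park a batch returned from the coprocessor
        return 0;
    }

    int nba::OffloadableElement::dispatch(uint64_t loop_count, PacketBatch *&out_batch, uint64_t &next_delay)
    {
        if (finished_batches.empty()) {
            out_batch = nullptr;             // nothing has completed yet
        } else {
            out_batch = finished_batches.front();
            finished_batches.pop_front();    // hand one finished batch back per call
        }
        next_delay = 0;
        return 0;
    }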
11 changes: 3 additions & 8 deletions include/nba/framework/config.hh
@@ -10,7 +10,7 @@
#define NBA_MAX_PORTS (16)
#define NBA_MAX_QUEUES_PER_PORT (128)
#define NBA_MAX_COPROCESSORS (2) // Max number of coprocessor devices
- #define NBA_MAX_PROCESSOR_TYPES (2) // Max number of device types (current: CPU and GPU)
+ #define NBA_MAX_COPROCESSOR_TYPES (1) // Max number of coprocessor types

#define NBA_MAX_PACKET_SIZE (2048)
#ifdef NBA_NO_HUGE
@@ -45,6 +45,8 @@
#define NBA_OQ (true) // Use output-queuing semantics when possible.
#undef NBA_CPU_MICROBENCH // Enable support for PAPI library for microbenchmarks.

+ #define NBA_REUSE_DATABLOCKS (1)

/* If you change below, update HANDLE_ALL_PORTS macro in lib/element.hh as well!! */
#define NBA_MAX_ELEM_NEXTS (4)

@@ -127,13 +129,6 @@ int get_ht_degree(void);
//#define TEST_MINIMAL_L2FWD
//#define TEST_RXONLY

- /* Inserts forced sleep when there is no packets received,
- * to reduce PCIe traffic. The performance may increase or decrease
- * depending on the system configuration.
- * (see lib/io.cc)
- */
- //#define NBA_SLEEPY_IO

#endif

// vim: ts=8 sts=4 sw=4 et
31 changes: 22 additions & 9 deletions include/nba/framework/elementgraph.hh
@@ -7,11 +7,19 @@
#include <nba/element/element.hh>
#include <nba/element/packetbatch.hh>
#include <vector>
+ #include <map>

namespace nba {

#define ROOT_ELEMENT (nullptr)

+ enum ElementOffloadingActions : int {
+     ELEM_OFFL_NOTHING = 0,
+     ELEM_OFFL_PREPROC = 1,
+     ELEM_OFFL_POSTPROC = 2,
+     ELEM_OFFL_POSTPROC_FIN = 4,
+ };

class Element;
class OffloadTask;
class PacketBatch;
@@ -41,14 +49,12 @@ public:
/* Tries to run all delayed batches. */
void flush_delayed_batches();

- /**
-  * A special case on completion of offloading.
-  * It begins DFS-based element graph traversing from the given
-  * offloaded element, with all results already calculated in the
-  * coprocessor thread.
-  */
- void enqueue_postproc_batch(PacketBatch *batch, Element *offloaded_elem,
-                             int input_port);
+ /* Scan and execute schedulable elements. */
+ void scan_offloadable_elements();
+
+ bool check_preproc(OffloadableElement *oel, int dbid);
+ bool check_postproc(OffloadableElement *oel, int dbid);
+ bool check_postproc_all(OffloadableElement *oel);

/**
* Check if the given datablock (represented as a global ID) is reused
@@ -106,12 +112,19 @@ protected:
comp_thread_context *ctx;

FixedRing<PacketBatch *, nullptr> queue;
- FixedRing<OffloadTask *, nullptr> ready_tasks[NBA_MAX_PROCESSOR_TYPES];
+ FixedRing<OffloadTask *, nullptr> ready_tasks[NBA_MAX_COPROCESSOR_TYPES];
FixedRing<PacketBatch *, nullptr> delayed_batches;

private:
+ std::map<std::pair<OffloadableElement*, int>, int> offl_actions;
+ std::set<OffloadableElement*> offl_fin;

SchedulableElement *input_elem;

friend int io_loop(void *arg);
+ friend int OffloadableElement::offload(ElementGraph *mother, PacketBatch *in_batch, int input_port);
+ friend void comp_thread_context::build_element_graph(const char *config);

};

}
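check_preproc()/check_postproc()/check_postproc_all() are declared above alongside
the ELEM_OFFL_* action bits and the offl_actions/offl_fin bookkeeping, but their
bodies are not in this excerpt. A minimal sketch of how they might be wired together
(the lookups below are assumptions, not the committed code):

    // Illustrative only -- the committed definitions are not shown in this diff.
    bool nba::ElementGraph::check_preproc(OffloadableElement *oel, int dbid)
    {
        auto it = offl_actions.find(std::make_pair(oel, dbid));
        return it != offl_actions.end() && (it->second & ELEM_OFFL_PREPROC) != 0;
    }

    bool nba::ElementGraph::check_postproc(OffloadableElement *oel, int dbid)
    {
        auto it = offl_actions.find(std::make_pair(oel, dbid));
        return it != offl_actions.end() && (it->second & ELEM_OFFL_POSTPROC) != 0;
    }

    bool nba::ElementGraph::check_postproc_all(OffloadableElement *oel)
    {
        // ELEM_OFFL_POSTPROC_FIN / offl_fin would mark the element after which
        // all datablocks for a task may be finalized.
        return offl_fin.count(oel) != 0;
    }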