refs #6: Work-in-progress - backporting our internal datacopy-optimization branch.

 * Backported changes

   - Change NBA_MAX_PROCESSOR_TYPES to NBA_MAX_COPROCESSOR_TYPES
     to count coprocessors from zero instead of one.

   - Add an xmem argument to FixedRing to allow using an externally
     allocated memory area for the ring.

   - Add graph analysis for datablock reuse.
     It determines when to preproc/postproc datablocks during element
     graph initialization.  (Not working yet...)

     . Also add check_preproc(), check_postproc(), check_postproc_all()
       methods to ElementGraph for later use.

   - Refactor and optimize scanning of schedulable elements.

   - Refactor OffloadableElement to make it schedulable for
     consistency.  This moves the task preparation code from
     ElementGraph into OffloadableElement.

   - Remove support for the "sleepy" IO loop.

 * Excluded changes

   - Change the IO loop to not consume all received packets,
     but instead call comp_process_batch() only once per iteration,
     using the number of packets exceeding the computation batch size
     to reduce IO polling overhead.

     => Rejected since it actually reduces performance by about 10%
        with CPU-only configurations.

 * New changes

   - Move invocations of the elemgraph->flush_* methods into the ev_check
     event handler for brevity and to reduce the chance of mistakes
     (see the sketch below).

 * Performance impacts

   - There is no degradation of CPU-only or GPU-only performance
     compared to the previous commit.
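As a rough illustration of the ev_check placement mentioned above (not part of this
diff; the loop/userdata wiring and the ctx->elem_graph member name are assumptions,
and only flush_delayed_batches() is visible in the headers below):

    #include <ev.h>
    // nba/framework headers are assumed for comp_thread_context and ElementGraph.

    static void comp_check_cb(struct ev_loop *loop, struct ev_check *watcher, int revents)
    {
        // Invoked once per ev_run() iteration.
        auto *ctx = (nba::comp_thread_context *) ev_userdata(loop);
        ctx->elem_graph->flush_delayed_batches();   // declared in elementgraph.hh below
    }

    // Registration during computation-thread startup (illustrative):
    // struct ev_check check_watcher;
    // ev_check_init(&check_watcher, comp_check_cb);
    // ev_check_start(loop, &check_watcher);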
achimnol committed Sep 13, 2015
1 parent ee69095 commit 05a270a
Showing 24 changed files with 571 additions and 397 deletions.
2 changes: 1 addition & 1 deletion elements/loadbalancers/CPUOnly.hh
@@ -33,7 +33,7 @@ public:

int process_batch(int input_port, PacketBatch *batch)
{
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, 0);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, -1);
return 0;
}

2 changes: 1 addition & 1 deletion elements/loadbalancers/GPUOnly.hh
@@ -33,7 +33,7 @@ public:

int process_batch(int input_port, PacketBatch *batch)
{
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, 1);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, 0);
return 0;
}

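The -1/0 values here and in the other load balancers below follow from the new
zero-based coprocessor numbering: a decision of -1 keeps the batch on the CPU,
while 0..NBA_MAX_COPROCESSORS-1 selects a coprocessor. A self-contained sketch of
the convention (decode_lb_decision() is illustrative, not part of NBA):

    #include <cassert>

    enum class Target { CPU, Coprocessor };

    // Illustrative decoder for the NBA_BANNO_LB_DECISION annotation after this
    // commit: -1 -> CPU (was 0), 0.. -> zero-based coprocessor index (was 1..).
    static Target decode_lb_decision(int decision, int &coproc_idx)
    {
        if (decision < 0)
            return Target::CPU;
        coproc_idx = decision;
        return Target::Coprocessor;
    }

    int main()
    {
        int idx = -1;
        assert(decode_lb_decision(-1, idx) == Target::CPU);
        assert(decode_lb_decision(0, idx) == Target::Coprocessor && idx == 0);
        return 0;
    }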
2 changes: 1 addition & 1 deletion elements/loadbalancers/LoadBalanceAIMD.hh
@@ -65,7 +65,7 @@ public:
}
choice = idx;
assert(choice >= 0);
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice - 1);
return 0;
}

2 changes: 1 addition & 1 deletion elements/loadbalancers/LoadBalanceAdaptiveGlobal.hh
@@ -52,7 +52,7 @@ public:
{
/* Generate a random number and find the interval where it belongs to. */
int64_t x = uniform_dist(random_generator);
- int _temp = (x > local_cpu_ratio);
+ int _temp = (x > local_cpu_ratio) - 1;
anno_set(&batch->banno, NBA_BANNO_LB_DECISION, _temp);
return 0;
}
6 changes: 3 additions & 3 deletions elements/loadbalancers/LoadBalanceAdaptiveMeasure.hh
@@ -77,10 +77,10 @@ public:
// }}}
#endif
// {{{ Deterministic load balancer
- int decision = 0;
+ int decision = -1;
rep ++;
if (offload) { // GPU-mode
- decision = 1;
+ decision = 0;
if (rep >= rep_limit_gpu) { // Change to CPU-mode
if (local_cpu_ratio == 0)
rep_limit = 0; // only once for sampling!
@@ -90,7 +90,7 @@ public:
offload = false;
}
} else { // CPU-mode
- decision = 0;
+ decision = -1;
if (rep >= rep_limit_cpu) { // Change to GPU-mode
rep_limit = rep_limit_gpu;
rep = 0;
4 changes: 2 additions & 2 deletions elements/loadbalancers/LoadBalanceByEnv.hh
@@ -34,9 +34,9 @@ public:
lb_mode = const_cast<char*>("CPUOnly");

if (!strcmp(lb_mode, "CPUOnly")) {
- lb_decision = 0;
+ lb_decision = -1;
} else if (!strcmp(lb_mode, "GPUOnly")) {
- lb_decision = 1;
+ lb_decision = 0;
} else {
rte_panic("Unsupported load balancer mode: %s\n", lb_mode);
}
4 changes: 2 additions & 2 deletions elements/loadbalancers/LoadBalanceByWeight.hh
@@ -24,7 +24,7 @@ public:
const char *port_count() const { return "1/1"; }
int get_type() const { return SchedulableElement::get_type() | PerBatchElement::get_type(); }

- int initialize() {
+ int initialize() {
/* Initialize random engines. */
out_probs = std::vector<float>();
uniform_dist = std::uniform_real_distribution<float>(0.0f, 1.0f);
@@ -85,7 +85,7 @@ public:
}
choice = idx;
assert(choice >= 0);
- anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice);
+ anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice - 1);
return 0;
}

6 changes: 3 additions & 3 deletions elements/loadbalancers/LoadBalancePPC.hh
@@ -64,11 +64,11 @@ public:

int process_batch(int input_port, PacketBatch *batch)
{
- int decision = 0;
+ int decision = -1;
const float c = (float) local_cpu_ratio / LB_PPC_CPU_RATIO_MULTIPLIER;
rep ++;
if (offload) {
- decision = 1;
+ decision = 0;
if (rep == rep_limit) { // Change to CPU-mode
if (local_cpu_ratio == 0)
rep_limit = 0; // only once for sampling!
@@ -78,7 +78,7 @@ public:
offload = false;
}
} else {
- decision = 0;
+ decision = -1;
if (rep == rep_limit) { // Change to GPU-mode
rep_limit = ctx->num_coproc_ppdepth;
rep = 0;
2 changes: 1 addition & 1 deletion elements/loadbalancers/LoadBalanceThruput.hh
@@ -60,7 +60,7 @@ public:
{
/* Generate a random number and find the interval where it belongs to. */
int64_t x = uniform_dist(random_generator);
- int choice = (x > local_cpu_ratio);
+ int choice = (x > local_cpu_ratio) - 1;
anno_set(&batch->banno, NBA_BANNO_LB_DECISION, choice);
return 0;
}
20 changes: 16 additions & 4 deletions include/nba/core/queue.hh
@@ -157,10 +157,10 @@ public:
/* Default constructor. You must explicitly call init() to use the instance. */
}

- FixedRing(size_t max_size, int numa_node = 0)
+ FixedRing(size_t max_size, int numa_node = 0, T *xmem = nullptr)
: v_(nullptr), push_idx(0), pop_idx(0), count(0), max_size(max_size)
{
- init(max_size, numa_node);
+ init(max_size, numa_node, xmem);
}

virtual ~FixedRing()
@@ -169,12 +169,16 @@ public:
rte_free(v_);
}

- void init(size_t max_size, int numa_node = 0)
+ void init(size_t max_size, int numa_node = 0, T *xmem = nullptr)
{
assert(max_size > 0);
this->count = 0;
this->max_size = max_size;
- v_ = (T*) rte_malloc_socket("fixedring", sizeof(T) * max_size, 64, numa_node);
+ if (xmem == nullptr) {
+     v_ = (T*) rte_malloc_socket("fixedring", sizeof(T) * max_size, 64, numa_node);
+ } else {
+     v_ = xmem;
+ }
assert(v_ != nullptr);
}

@@ -186,6 +190,14 @@ public:
count ++;
}

+ void push_front(T t)
+ {
+     assert(count < max_size);
+     v_[pop_idx - 1] = t;
+     pop_idx = (pop_idx - 1) % max_size;
+     count ++;
+ }

T front() const
{
if (!empty())
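With the new xmem parameter, a FixedRing can sit on top of a caller-provided buffer
instead of memory from rte_malloc_socket(); element.hh below uses exactly this for
finished_batches. A minimal usage sketch (QLEN and setup_finished_ring() are
made-up names; the NBA headers from this commit are assumed):

    static const size_t QLEN = 256;
    static PacketBatch *ring_storage[QLEN];           // externally allocated backing store
    static FixedRing<PacketBatch *, nullptr> ring;    // default-constructed, init() called later

    static void setup_finished_ring()
    {
        // With xmem != nullptr, init() uses the given buffer and skips the
        // rte_malloc_socket() call; the numa_node argument is then irrelevant.
        ring.init(QLEN, -1, ring_storage);
    }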
28 changes: 23 additions & 5 deletions include/nba/element/element.hh
@@ -19,6 +19,7 @@ namespace nba {

/* Forward declarations. */
class Element;
+ class ElementGraph;
class OffloadTask;
class comp_thread_context;
class ComputeContext;
@@ -58,6 +59,7 @@

private:
friend class Element;
+ friend class GraphMetaData;

int my_idx;
Element *elem;
@@ -186,28 +188,42 @@ public:
uint64_t _last_check_tick;
};

- class OffloadableElement : virtual public Element {
+ class OffloadableElement : virtual public SchedulableElement {

+ friend class ElementGraph;

+ private:
+ static const size_t MAX_FINBATCH_QLEN = NBA_MAX_COPROC_PPDEPTH * 16;

public:
+ int reuse_head_ref = 0;
+ int reuse_tail_ref = 0;

- OffloadableElement() : Element()
+ OffloadableElement() : SchedulableElement()
{
if (dummy_device) {
auto ch = [this] (ComputeContext *ctx, struct resource_param *res) {
this->dummy_compute_handler(ctx, res);
};
offload_compute_handlers.insert({{"dummy", ch},});
}
- for (int i = 0; i < NBA_MAX_PROCESSOR_TYPES; i++)
+ for (int i = 0; i < NBA_MAX_COPROCESSOR_TYPES; i++)
tasks[i] = nullptr;
+ finished_batches.init(MAX_FINBATCH_QLEN, -1, finished_batches_arrbuf);
}
virtual ~OffloadableElement() {}
- int get_type() const { return ELEMTYPE_OFFLOADABLE; }
+ int get_type() const { return ELEMTYPE_OFFLOADABLE | ELEMTYPE_SCHEDULABLE; }

+ /** Begins offloading of the given batch. */
+ int offload(ElementGraph *mother, PacketBatch *in_batch, int input_port);
+
+ /** Stores the batches that are returned from offloading. */
+ int enqueue_batch(PacketBatch *batch);
+
+ /** Resumes the element graph processing using the enqueued batches. */
+ int dispatch(uint64_t loop_count, PacketBatch*& out_batch, uint64_t &next_delay);

/** Returns the list of supported devices for offloading. */
virtual void get_supported_devices(std::vector<std::string> &device_names) const = 0;
//virtual size_t get_desired_workgroup_size(const char *device_name) const = 0;
virtual int get_offload_item_counter_dbid() const = 0;
@@ -222,7 +238,9 @@ public:
std::unordered_map<std::string, offload_init_handler> offload_init_handlers;

private:
- OffloadTask *tasks[NBA_MAX_PROCESSOR_TYPES];
+ OffloadTask *tasks[NBA_MAX_COPROCESSOR_TYPES];
+ FixedRing<PacketBatch*, nullptr> finished_batches;
+ PacketBatch *finished_batches_arrbuf[MAX_FINBATCH_QLEN];
void dummy_compute_handler(ComputeContext *ctx, struct resource_param *res);
};

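enqueue_batch() and dispatch() are only declared here; their bodies are not part of
this excerpt. A rough sketch of the intended flow (the bodies below are assumptions,
and push_back()/pop_front() are assumed FixedRing counterparts of the push_front()/
front() shown above):

    // Illustrative only -- not the committed implementation.
    int nba::OffloadableElement::enqueue_batch(PacketBatch *batch)
    {
        finished_batches.push_back(batch);   // park a batch returned from the coprocessor
        return 0;
    }

    int nba::OffloadableElement::dispatch(uint64_t loop_count, PacketBatch *&out_batch, uint64_t &next_delay)
    {
        if (finished_batches.empty()) {
            out_batch = nullptr;             // nothing has completed yet
        } else {
            out_batch = finished_batches.front();
            finished_batches.pop_front();    // hand one finished batch back per call
        }
        next_delay = 0;
        return 0;
    }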
11 changes: 3 additions & 8 deletions include/nba/framework/config.hh
@@ -10,7 +10,7 @@
#define NBA_MAX_PORTS (16)
#define NBA_MAX_QUEUES_PER_PORT (128)
#define NBA_MAX_COPROCESSORS (2) // Max number of coprocessor devices
- #define NBA_MAX_PROCESSOR_TYPES (2) // Max number of device types (current: CPU and GPU)
+ #define NBA_MAX_COPROCESSOR_TYPES (1) // Max number of coprocessor types

#define NBA_MAX_PACKET_SIZE (2048)
#ifdef NBA_NO_HUGE
@@ -45,6 +45,8 @@
#define NBA_OQ (true) // Use output-queuing semantics when possible.
#undef NBA_CPU_MICROBENCH // Enable support for PAPI library for microbenchmarks.

+ #define NBA_REUSE_DATABLOCKS (1)

/* If you change below, update HANDLE_ALL_PORTS macro in lib/element.hh as well!! */
#define NBA_MAX_ELEM_NEXTS (4)

@@ -127,13 +129,6 @@ int get_ht_degree(void);
//#define TEST_MINIMAL_L2FWD
//#define TEST_RXONLY

- /* Inserts forced sleep when there is no packets received,
- * to reduce PCIe traffic. The performance may increase or decrease
- * depending on the system configuration.
- * (see lib/io.cc)
- */
- //#define NBA_SLEEPY_IO

#endif

// vim: ts=8 sts=4 sw=4 et
31 changes: 22 additions & 9 deletions include/nba/framework/elementgraph.hh
@@ -7,11 +7,19 @@
#include <nba/element/element.hh>
#include <nba/element/packetbatch.hh>
#include <vector>
+ #include <map>

namespace nba {

#define ROOT_ELEMENT (nullptr)

+ enum ElementOffloadingActions : int {
+     ELEM_OFFL_NOTHING = 0,
+     ELEM_OFFL_PREPROC = 1,
+     ELEM_OFFL_POSTPROC = 2,
+     ELEM_OFFL_POSTPROC_FIN = 4,
+ };

class Element;
class OffloadTask;
class PacketBatch;
@@ -41,14 +49,12 @@ public:
/* Tries to run all delayed batches. */
void flush_delayed_batches();

- /**
-  * A special case on completion of offloading.
-  * It begins DFS-based element graph traversing from the given
-  * offloaded element, with all results already calculated in the
-  * coprocessor thread.
-  */
- void enqueue_postproc_batch(PacketBatch *batch, Element *offloaded_elem,
-                             int input_port);
+ /* Scan and execute schedulable elements. */
+ void scan_offloadable_elements();
+
+ bool check_preproc(OffloadableElement *oel, int dbid);
+ bool check_postproc(OffloadableElement *oel, int dbid);
+ bool check_postproc_all(OffloadableElement *oel);

/**
* Check if the given datablock (represented as a global ID) is reused
@@ -106,12 +112,19 @@ protected:
comp_thread_context *ctx;

FixedRing<PacketBatch *, nullptr> queue;
- FixedRing<OffloadTask *, nullptr> ready_tasks[NBA_MAX_PROCESSOR_TYPES];
+ FixedRing<OffloadTask *, nullptr> ready_tasks[NBA_MAX_COPROCESSOR_TYPES];
FixedRing<PacketBatch *, nullptr> delayed_batches;

private:
+ std::map<std::pair<OffloadableElement*, int>, int> offl_actions;
+ std::set<OffloadableElement*> offl_fin;

SchedulableElement *input_elem;

friend int io_loop(void *arg);
+ friend int OffloadableElement::offload(ElementGraph *mother, PacketBatch *in_batch, int input_port);
+ friend void comp_thread_context::build_element_graph(const char *config);

};

}
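check_preproc()/check_postproc()/check_postproc_all() are declared above alongside
the ELEM_OFFL_* action bits and the offl_actions/offl_fin bookkeeping, but their
bodies are not in this excerpt. A minimal sketch of how they might be wired together
(the lookups below are assumptions, not the committed code):

    // Illustrative only -- the committed definitions are not shown in this diff.
    bool nba::ElementGraph::check_preproc(OffloadableElement *oel, int dbid)
    {
        auto it = offl_actions.find(std::make_pair(oel, dbid));
        return it != offl_actions.end() && (it->second & ELEM_OFFL_PREPROC) != 0;
    }

    bool nba::ElementGraph::check_postproc(OffloadableElement *oel, int dbid)
    {
        auto it = offl_actions.find(std::make_pair(oel, dbid));
        return it != offl_actions.end() && (it->second & ELEM_OFFL_POSTPROC) != 0;
    }

    bool nba::ElementGraph::check_postproc_all(OffloadableElement *oel)
    {
        // ELEM_OFFL_POSTPROC_FIN / offl_fin would mark the element after which
        // all datablocks for a task may be finalized.
        return offl_fin.count(oel) != 0;
    }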