Commit aba616a

Merge remote-tracking branch 'lqchen'

abcdabcd987 committed Aug 27, 2019
2 parents e080d26 + 35c13be
Showing 27 changed files with 753 additions and 278 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,3 +13,5 @@ python/nexus/proto/

cmake-build*/
.clion.source.upload.marker
.clangd/
compile_commands.json
3 changes: 1 addition & 2 deletions .gitmodules
@@ -14,8 +14,7 @@
path = frameworks/tensorflow
url = https://github.com/tensorflow/tensorflow.git
branch = master
shallow = true
[submodule "frameworks/caffe2"]
path = frameworks/caffe2
url = git@github.com:abcdabcd987/caffe2-nexus.git
url = https://github.com/abcdabcd987/caffe2-nexus.git
branch = nexus
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -298,6 +298,13 @@ target_link_libraries(test_complex_query PRIVATE common)



###### tools/bench_tfshare ######
add_executable(bench_tfshare tools/bench_tfshare.cpp)
target_compile_features(bench_tfshare PRIVATE cxx_std_11)
target_link_libraries(bench_tfshare PRIVATE common backend_obj)



# FIXME ###### tests ######
# add_executable(runtest
# tests/cpp/scheduler/backend_delegate_test.cpp
3 changes: 2 additions & 1 deletion INSTALL.md
@@ -179,10 +179,11 @@ cd ../..

# YAY!!! FINALLY US!!! nexus
cd ..
git clone git@github.com:abcdabcd987/nexus.git -b lqchen
git clone https://github.com/uwsampl/nexus.git
cd nexus
git submodule update --init --recursive
mkdir build
cd build
cmake .. -DCMAKE_CXX_COMPILER=g++-8.3 -DCMAKE_BUILD_TYPE=Debug -DUSE_GPU=ON -DCUDA_PATH=/usr/local/cuda-10.0 -DUSE_TENSORFLOW=ON -DUSE_CAFFE=OFF -DUSE_CAFFE2=OFF -DUSE_DARKNET=OFF
make -j$(nproc)
```
78 changes: 7 additions & 71 deletions README.md
@@ -3,22 +3,15 @@ Nexus
Nexus is a scalable and efficient serving system for DNN applications on GPU
cluster.

This is a joint research project between University of Washington and Microsoft Research.
## Installation

Compile and Deploy Nexus
========================
See [INSTALL.md](INSTALL.md) for details.

Download nexus and sample models
--------------
## SOSP 2019 Paper

### Download Nexus
* Check out the [Google Drive](https://drive.google.com/open?id=104UqrlNrfJoQnGdkxTQ56mfxSBFyJTcr) that contains the artifacts.

Download Nexus and the frameworks it supports from GitHub
```
$ git clone --recursive https://github.com/uwsaml/nexus.git
$ cd nexus
$ git submodule update --init --recursive
```
## Deployment

### Download Model Zoo

@@ -36,66 +29,9 @@ $ mkdir nexus-models
$ aws s3 sync s3://nexus-models nexus-models/
```

Pre-requisites
--------------

Recommended development environment on Ubuntu (>=16.04)

### General dependencies
```
$ [sudo] apt-get install libboost-all-dev libgflags-dev libgoogle-glog-dev \
libgtest-dev libyaml-cpp-dev libopenblas-dev libleveldb-dev liblmdb-dev \
libhdf5-serial-dev
```

### Required libraries
* protobuf >= 3.5.0
* [grpc](https://github.com/grpc/grpc/blob/master/INSTALL.md) >= v1.4.x
* OpenCV >= 3.0
* bazel >= 0.10.0
* CUDA >= 8.0
* CUDNN >= 6.0

### Prerequisites for Deploying the Nexus Service

Nexus can be compiled from source, but to run the Nexus service across multiple servers we use docker swarm, so we recommend deploying Nexus with docker.

To build the Nexus docker images and deploy Nexus, the following packages are required:
* [CUDA 8.0](https://developer.nvidia.com/cuda-80-ga2-download-archive)
* [docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/)(>=1.12)
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)

Set Config for Nexus
--------------------

There is a file named Makefile.config in the nexus folder. You can configure the CUDA and CUDNN library paths in this file: $(CUDA_PATH) should be set to the CUDA library path, and $(CUDNN_PATH) to the CUDNN library path.

Other variables such as $(USE_CAFFE) decide whether Nexus supports the corresponding framework. Nexus cannot support Caffe and Caffe2 at the same time.
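As a hedged sketch of the settings described above (the exact variable names beyond $(CUDA_PATH), $(CUDNN_PATH), and $(USE_CAFFE), and the default values, are assumptions rather than the repo's actual Makefile.config), the file might look like:

```make
# Paths to the CUDA and CUDNN installations (adjust to your system).
CUDA_PATH := /usr/local/cuda-10.0
CUDNN_PATH := /usr/local/cudnn

# Framework toggles. Caffe and Caffe2 cannot both be enabled.
USE_CAFFE := 0
USE_CAFFE2 := 0
USE_TENSORFLOW := 1
USE_DARKNET := 0
```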


Local Tests without Docker
--------------------------

### Compile from source

Nexus can be compiled from source with the make command:
```
$ cd nexus
$ cp frameworks/tf_configure.bazelrc frameworks/tensorflow/.tf_configure.bazelrc
$ cd frameworks/tensorflow
$ bazel --output_base=../../build/tensorflow build --config=opt //tensorflow:libtensorflow.so //tensorflow:libtensorflow_cc.so //tensorflow:libtensorflow_framework.so //tensorflow:install_headers
$ cd ../..
$ make all
```

In the Nexus repo, we provide a few sample applications located at `nexus/apps`.
Go to the directory of each application, e.g., nexus/apps/simple_app/, and
compile the application by
```
$ cd nexus/apps/simple_app
$ make
```

### Local Tests

There are some commands to test these sample applications locally. We will use simple_app as an example.
@@ -128,8 +64,8 @@ Since we recommend using docker, there are four docker images that need to be
First you need to build the base docker image that installs all dependent libraries required for Nexus.

The following docker build commands contain -t and -f options. The -t option is followed
by a name and optionally a tag in the name:tag format. The -f option is followed by
the name of the Dockerfile (default is PATH/Dockerfile).
by a name and optionally a tag in the `name:tag` format. The -f option is followed by
the name of the Dockerfile (default is `PATH/Dockerfile`).
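For instance (the image name and Dockerfile name below are illustrative assumptions, not the repo's actual names), a build command might look like:

```shell
# Build a hypothetical base image from a Dockerfile in the current directory,
# tagging it nexus/base:latest (-t name:tag, -f path to the Dockerfile).
docker build -t nexus/base:latest -f NexusBaseDockerfile .
```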

```
$ cd nexus/dockerfiles
13 changes: 13 additions & 0 deletions python/nexus/async_client.py
@@ -68,6 +68,19 @@ def _prepare_req(self, img):
self._req_id += 1
return req

def request_with_hack_filename(self, filename):
req = npb.RequestProto()
req.user_id = self._user_id
req.req_id = self._req_id
req.input.data_type = npb.DT_IMAGE
req.input.image.hack_filename = filename
req.input.image.format = npb.ImageProto.JPEG
req.input.image.color = True
self._req_id += 1

msg = self._prepare_message(MSG_USER_REQUEST, req)
return self._do_request(req, msg)

def _prepare_message(self, msg_type, request):
body = request.SerializeToString()
header = struct.pack('!LLL', MAGIC_NUMBER, msg_type, len(body))
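The wire framing used by `_prepare_message` can be sketched standalone (the constant values below are placeholders for illustration, not Nexus's actual magic number or message-type IDs):

```python
import struct

MAGIC_NUMBER = 0xDEADBEEF  # placeholder; the real value is defined in the nexus client code
MSG_USER_REQUEST = 1       # placeholder message-type ID

def frame_message(msg_type, body):
    # Header is three big-endian uint32s: magic number, message type, body length.
    header = struct.pack('!LLL', MAGIC_NUMBER, msg_type, len(body))
    return header + body

def parse_header(msg):
    # Inverse of the packing above; returns (magic, msg_type, body_length).
    return struct.unpack('!LLL', msg[:12])

framed = frame_message(MSG_USER_REQUEST, b'serialized-proto-bytes')
```

The serialized protobuf body simply follows the 12-byte header on the wire.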
25 changes: 17 additions & 8 deletions src/nexus/backend/backend_main.cpp
@@ -9,16 +9,19 @@
#include <gflags/gflags.h>
#include <glog/logging.h>

#include "nexus/backend/backend_server.h"
#include "nexus/common/config.h"
#include "nexus/common/image.h"
#include "nexus/common/util.h"
#include "nexus/backend/backend_server.h"
#include "nexus/proto/nnquery.pb.h"

using namespace nexus;
using namespace nexus::backend;

DEFINE_string(port, std::to_string(BACKEND_DEFAULT_PORT), "server port");
DEFINE_string(rpc_port, std::to_string(BACKEND_DEFAULT_RPC_PORT), "RPC port");
DEFINE_string(sch_addr, "127.0.0.1", "scheduler IP address "
DEFINE_string(sch_addr, "127.0.0.1",
"scheduler IP address "
"(use default port 10001 if no port specified)");
DEFINE_int32(gpu, 0, "gpu device ID (default: 0)");
DEFINE_uint64(num_workers, 0, "number of workers (default: 0)");
@@ -54,7 +57,7 @@ void sigint_handler(int _sig) {
std::exit(0);
}

int main(int argc, char** argv) {
int main(int argc, char **argv) {
struct sigaction sig_handle;
sig_handle.sa_handler = sigint_handler;
sigemptyset(&sig_handle.sa_mask);
@@ -68,13 +71,19 @@ int main(int argc, char** argv) {
// Setup backtrace on segfault
google::InstallFailureSignalHandler();
// Decide server IP address
LOG(INFO) << "Backend server: port " << FLAGS_port << ", rpc port " <<
FLAGS_rpc_port << ", workers " << FLAGS_num_workers << ", gpu " <<
FLAGS_gpu;
LOG(INFO) << "Backend server: port " << FLAGS_port << ", rpc port "
<< FLAGS_rpc_port << ", workers " << FLAGS_num_workers << ", gpu "
<< FLAGS_gpu;
// Initialize _Hack_Images
{
ImageProto image;
image.set_hack_filename("__init_Hack_Images");
(void)_Hack_DecodeImageByFilename(image, ChannelOrder::CO_BGR);
}
// Create the backend server
std::vector<int> cores = ParseCores(FLAGS_cores);
BackendServer server(FLAGS_port, FLAGS_rpc_port, FLAGS_sch_addr,
FLAGS_gpu, FLAGS_num_workers, cores);
BackendServer server(FLAGS_port, FLAGS_rpc_port, FLAGS_sch_addr, FLAGS_gpu,
FLAGS_num_workers, cores);
server_ptr = &server;
server.Run();
return 0;
1 change: 1 addition & 0 deletions src/nexus/backend/backend_server.cpp
@@ -454,6 +454,7 @@ void BackendServer::Register() {
GPUDevice* gpu_device = DeviceManager::Singleton().GetGPUDevice(
gpu_id_);
request.set_gpu_device_name(gpu_device->device_name());
request.set_gpu_uuid(gpu_device->uuid());
request.set_gpu_available_memory(gpu_device->FreeMemory());

while (true) {
2 changes: 1 addition & 1 deletion src/nexus/backend/model_exec.cpp
@@ -28,7 +28,7 @@ ModelExecutor::ModelExecutor(int gpu_id, const ModelInstanceConfig& config,
#ifdef USE_GPU
auto gpu_device = DeviceManager::Singleton().GetGPUDevice(gpu_id);
profile_ = ModelDatabase::Singleton().GetModelProfile(
gpu_device->device_name(), model_->profile_id());
gpu_device->device_name(), gpu_device->uuid(), model_->profile_id());
#endif
req_counter_ = MetricRegistry::Singleton().CreateIntervalCounter(
FLAGS_backend_count_interval);
22 changes: 12 additions & 10 deletions src/nexus/backend/tensorflow_model.cpp
@@ -25,16 +25,16 @@ TensorflowModel::TensorflowModel(int gpu_id, const ModelInstanceConfig& config):
gpu_opt->set_visible_device_list(std::to_string(gpu_id));
gpu_opt->set_allocator_type("BFC");
LOG(INFO) << "model memory usage: " << config.memory_usage() << " B";
gpu_opt->set_allow_growth(true);
// if (config.memory_usage() > 0) {
// double memory_usage = config.memory_usage();
// LOG(INFO) << "model memory usage: " << memory_usage << " B";
// gpu_opt->set_per_process_gpu_memory_fraction(
// memory_usage / gpu_device_->TotalMemory());
// gpu_opt->set_allow_growth(false);
// } else {
// gpu_opt->set_allow_growth(true);
// }
// gpu_opt->set_allow_growth(true);
if (config.memory_usage() > 0) {
double memory_usage = config.memory_usage();
LOG(INFO) << "model memory usage: " << memory_usage << " B";
gpu_opt->set_per_process_gpu_memory_fraction(
memory_usage / gpu_device_->TotalMemory());
gpu_opt->set_allow_growth(false);
} else {
gpu_opt->set_allow_growth(true);
}
(*cpu_option_.config.mutable_device_count())["GPU"] = 0;

// Init session and load model
@@ -339,6 +339,8 @@ void TensorflowModel::MarshalDetectionResult(
}
for (int i = 0; i < num_boxes; ++i) {
auto record = result->add_output();
if (FLAGS_hack_reply_omit_output)
continue;
int class_id = static_cast<int>(classes[i]);
for (auto field : output_fields) {
if (field == "rect") {
24 changes: 15 additions & 9 deletions src/nexus/backend/utils.cpp
@@ -1,15 +1,19 @@
#include <fstream>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <unordered_set>

#include "nexus/common/util.h"
#include "nexus/backend/utils.h"
#include "nexus/common/util.h"

DEFINE_bool(hack_reply_omit_output, false,
"HACK: omit output field in ReplyProto");

namespace nexus {
namespace backend {

void LoadClassnames(const std::string& filepath,
std::unordered_map<int, std::string>* classnames) {
void LoadClassnames(const std::string &filepath,
std::unordered_map<int, std::string> *classnames) {
std::ifstream infile(filepath);
CHECK(infile.good()) << "Classname file " << filepath << " doesn't exist";
std::string line;
@@ -28,13 +32,13 @@ void LoadClassnames(const std::string& filepath,
}

void PostprocessClassification(
const QueryProto& query, const float* prob, size_t nprobs,
QueryResultProto* result,
const std::unordered_map<int, std::string>* classnames) {
const QueryProto &query, const float *prob, size_t nprobs,
QueryResultProto *result,
const std::unordered_map<int, std::string> *classnames) {
// TODO: handle top k and threshold in the query
if (classnames != nullptr) {
CHECK_EQ(classnames->size(), nprobs) << "Mismatch between number of " <<
"class names and number of outputs";
CHECK_EQ(classnames->size(), nprobs) << "Mismatch between number of "
<< "class names and number of outputs";
}
std::unordered_set<std::string> output_fields(query.output_field().begin(),
query.output_field().end());
@@ -45,7 +49,7 @@
}
float max_prob = 0.;
int max_idx = -1;
for (int i = 0; i < (int) nprobs; ++i) {
for (int i = 0; i < (int)nprobs; ++i) {
float p = prob[i];
if (p > max_prob) {
max_prob = p;
@@ -54,6 +58,8 @@
}
if (max_idx > -1) {
auto record = result->add_output();
if (FLAGS_hack_reply_omit_output)
return;
for (auto field : output_fields) {
if (field == "class_id") {
auto value = record->add_named_value();
24 changes: 22 additions & 2 deletions src/nexus/common/device.cpp
@@ -4,6 +4,9 @@
namespace nexus {

#ifdef USE_GPU

DEFINE_bool(generic_profile, false, "Use the generic profile for all GPUs of the same model instead of using profiles for each GPU card. (Applicable to Backend only)");

GPUDevice::GPUDevice(int gpu_id) :
Device(kGPU), gpu_id_(gpu_id) {
std::stringstream ss;
@@ -15,8 +18,25 @@ GPUDevice::GPUDevice(int gpu_id) :
device_name_.assign(prop.name, strlen(prop.name));
std::replace(device_name_.begin(), device_name_.end(), ' ', '_');
total_memory_ = prop.totalGlobalMem;
LOG(INFO) << "GPU " << gpu_id << " " << device_name_ << ": total memory " <<
total_memory_ / 1024. / 1024. / 1024. << "GB";

if (FLAGS_generic_profile) {
uuid_ = "generic";
} else {
auto *u = reinterpret_cast<unsigned char *>(&prop.uuid);
char uuid_hex[37] = {};
sprintf(uuid_hex,
"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
u[0], u[1], u[2], u[3],
u[4], u[5],
u[6], u[7],
u[8], u[9],
u[10], u[11], u[12], u[13], u[14], u[15]);
uuid_ = uuid_hex;
}

LOG(INFO) << "GPU " << gpu_id << " " << device_name_
<< "(" << uuid_ << ")"
<< ": total memory " << total_memory_ / 1024. / 1024. / 1024. << "GB";
}

void *GPUDevice::Allocate(size_t nbytes) {
5 changes: 4 additions & 1 deletion src/nexus/common/device.h
@@ -79,7 +79,9 @@ class GPUDevice : public Device {

std::string name() const final { return name_; }

std::string device_name() { return device_name_; }
std::string device_name() const { return device_name_; }

std::string uuid() const { return uuid_; }

size_t FreeMemory() const;

@@ -93,6 +95,7 @@
int gpu_id_;
std::string name_;
std::string device_name_;
std::string uuid_;
size_t total_memory_;
};
