Add BLS sync (#68)
* Move InferenceRequest implementation to C++

* Add BLS sync (see the usage sketch after this list)

* Remove the support for GPU tensors by default

* Fix error handling for BLS requests

* Refactor

* Fix response deletion if there is an error

* Add get_output_tensor_by_name to Python backend utils

* Minor improvements and bug fixes

* Minor bug fix in releasing the bls inference responses
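The synchronous BLS path and the get_output_tensor_by_name helper added here are exposed to a model's model.py through triton_python_backend_utils. A minimal, hypothetical usage sketch (the model name "preprocessor" and the tensor names are placeholders, not part of this commit; request is the incoming InferenceRequest inside execute()):

import triton_python_backend_utils as pb_utils

# Inside a model's execute() loop: forward a tensor from the incoming
# request to another model and block until its response arrives.
input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
bls_request = pb_utils.InferenceRequest(
    model_name="preprocessor",
    requested_output_names=["OUTPUT0"],
    inputs=[input_tensor])
bls_response = bls_request.exec()
output_tensor = pb_utils.get_output_tensor_by_name(bls_response, "OUTPUT0")
output_array = output_tensor.as_numpy()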
Tabrizian authored Aug 6, 2021
1 parent 884ff52 commit 4c01991
Showing 16 changed files with 607 additions and 164 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
/build
/.vscode
*.so
builddir

### Python ###
# Byte-compiled / optimized / DLL files
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -38,6 +38,7 @@ project(tritonpythonbackend LANGUAGES C CXX)
# because python backend does not need to access CUDA or GPUs
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_GPU_TENSORS "Allow GPU input and output tensors" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
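The new TRITON_ENABLE_GPU_TENSORS option defaults to OFF, matching the "Remove the support for GPU tensors by default" change above, so BLS tensors are expected to live in host memory. A minimal sketch, assuming the standard pb_utils API, of building a CPU-resident input tensor from a numpy buffer ("INPUT0" is a placeholder name):

import numpy as np
import triton_python_backend_utils as pb_utils

# With GPU tensors disabled at build time, tensors passed to a BLS request
# are constructed from host (numpy) arrays.
input_tensor = pb_utils.Tensor("INPUT0", np.ones([1, 16], dtype=np.float32))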
83 changes: 82 additions & 1 deletion src/infer_request.cc
@@ -25,7 +25,7 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "infer_request.h"

#include "pb_utils.h"
#ifdef TRITON_PB_STUB
#include "infer_response.h"
#include "pb_stub.h"
@@ -148,4 +148,85 @@ InferRequest::LoadFromSharedMemory(
      requested_output_names, model_name, request->model_version);
}

#ifdef TRITON_PB_STUB
std::unique_ptr<InferResponse>
InferRequest::Exec()
{
  ResponseBatch* response_batch = nullptr;
  bool responses_is_set = false;
  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
  std::unique_ptr<SharedMemory>& shm_pool = stub->GetSharedMemory();
  IPCMessage* ipc_message = stub->GetIPCMessage();
  try {
    ipc_message->stub_command = PYTHONSTUB_CommandType::PYTHONSTUB_Execute;

    ExecuteArgs* exec_args;
    shm_pool->Map(
        (char**)&exec_args, sizeof(ExecuteArgs), ipc_message->stub_args);

    RequestBatch* request_batch;
    shm_pool->Map(
        (char**)&request_batch, sizeof(RequestBatch), exec_args->request_batch);
    request_batch->batch_size = 1;

    Request* request;
    shm_pool->Map((char**)&request, sizeof(Request), request_batch->requests);

    request->requested_input_count = this->Inputs().size();
    Tensor* tensors;
    shm_pool->Map(
        (char**)&tensors, sizeof(Tensor) * request->requested_input_count,
        request->inputs);

    // TODO: Custom handling for GPU
    size_t i = 0;
    for (auto& input_tensor : this->Inputs()) {
      input_tensor->SaveToSharedMemory(shm_pool, &tensors[i]);
      i += 1;
    }
    this->SaveToSharedMemory(shm_pool, request);

    ipc_message->stub_command =
        PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest;
    stub->NotifyParent();
    stub->WaitForNotification();

    ipc_message->stub_command = PYTHONSTUB_CommandType::PYTHONSTUB_Execute;

    shm_pool->MapOffset((char**)&response_batch, exec_args->response_batch);
    responses_is_set = true;

    if (response_batch->has_error) {
      if (response_batch->is_error_set) {
        char* err_string;
        LoadStringFromSharedMemory(shm_pool, response_batch->error, err_string);
        return std::make_unique<InferResponse>(
            std::vector<std::shared_ptr<PbTensor>>{},
            std::make_shared<PbError>(err_string));
      } else {
        return std::make_unique<InferResponse>(
            std::vector<std::shared_ptr<PbTensor>>{},
            std::make_shared<PbError>(
                "An error occurred while performing BLS request."));
      }
    }
  }
  catch (const PythonBackendException& pb_exception) {
    return std::make_unique<InferResponse>(
        std::vector<std::shared_ptr<PbTensor>>{},
        std::make_shared<PbError>(pb_exception.what()));
  }

  if (responses_is_set) {
    return InferResponse::LoadFromSharedMemory(
        shm_pool, response_batch->responses);
  } else {
    return std::make_unique<InferResponse>(
        std::vector<std::shared_ptr<PbTensor>>{},
        std::make_shared<PbError>(
            "An error occurred while performing BLS request."));
  }
}
#endif

}}} // namespace triton::backend::python
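On the Python side, the error branches in Exec() above surface as a response whose error flag is set rather than as an exception raised from exec(). A minimal sketch of checking for that, continuing the hypothetical bls_request from the sketch near the top of this page:

import triton_python_backend_utils as pb_utils

bls_response = bls_request.exec()

# Mirrors the has_error / is_error_set handling in Exec(): a failed BLS
# request is reported through the response object itself.
if bls_response.has_error():
    raise pb_utils.TritonModelException(bls_response.error().message())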
9 changes: 6 additions & 3 deletions src/infer_request.h
@@ -27,6 +27,7 @@
#pragma once

#include <string>
#include "infer_response.h"
#include "pb_tensor.h"

namespace triton { namespace backend { namespace python {
@@ -58,12 +59,14 @@ class InferRequest {
  /// space to save the inference request.
  void SaveToSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool, Request* request_shm);

  /// Create an Inference Request object from shared memory.
  /// \param shm_pool Shared memory pool
  /// \param request_offset Shared memory offset of the request.
  static std::unique_ptr<InferRequest> LoadFromSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool,
      off_t request_offset);
      std::unique_ptr<SharedMemory>& shm_pool, off_t request_offset);
#ifdef TRITON_PB_STUB
  std::unique_ptr<InferResponse> Exec();
#endif
};
}}}; // namespace triton::backend::python
5 changes: 3 additions & 2 deletions src/infer_response.cc
@@ -60,7 +60,7 @@ InferResponse::IsErrorMessageSet()

void
InferResponse::SaveToSharedMemory(
    std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm)
    std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm, bool copy)
{
  size_t output_tensor_length = output_tensors_.size();
  response_shm->has_error = false;
@@ -77,11 +77,12 @@
  size_t j = 0;
  for (auto& output_tensor : output_tensors_) {
    Tensor* output_tensor_shm = &output_tensors_shm[j];
    output_tensor->SaveToSharedMemory(shm_pool, output_tensor_shm);
    output_tensor->SaveToSharedMemory(shm_pool, output_tensor_shm, copy);
    j++;
  }

  if (this->HasError()) {
    response_shm->has_error = true;
    off_t error_offset;
    SaveStringToSharedMemory(
        shm_pool, error_offset, this->Error()->Message().c_str());
2 changes: 1 addition & 1 deletion src/infer_response.h
@@ -44,7 +44,7 @@ class InferResponse {
  bool IsErrorMessageSet();
  std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
  void SaveToSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm);
      std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm, bool copy);
  static std::unique_ptr<InferResponse> LoadFromSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool, off_t response_offset);
  bool HasError();