Add BLS sync (#68)
* Move InferenceRequest implementation to C++

* Add BLS sync (see the usage sketch after this list)

* Remove the support for GPU tensors by default

* Fix error handling for BLS requests

* Refactor

* Fix response deletion if there is an error

* Add get_output_tensor_by_name to Python backend utils

* Minor improvements and bug fixes

* Minor bug fix in releasing the bls inference responses
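The synchronous BLS path and the get_output_tensor_by_name helper added here are exposed to a model's model.py through triton_python_backend_utils. A minimal, hypothetical usage sketch (the model name "preprocessor" and the tensor names are placeholders, not part of this commit; request is the incoming InferenceRequest inside execute()):

import triton_python_backend_utils as pb_utils

# Inside a model's execute() loop: forward a tensor from the incoming
# request to another model and block until its response arrives.
input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
bls_request = pb_utils.InferenceRequest(
    model_name="preprocessor",
    requested_output_names=["OUTPUT0"],
    inputs=[input_tensor])
bls_response = bls_request.exec()
output_tensor = pb_utils.get_output_tensor_by_name(bls_response, "OUTPUT0")
output_array = output_tensor.as_numpy()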
Tabrizian authored Aug 6, 2021
1 parent 884ff52 commit 4c01991
Showing 16 changed files with 607 additions and 164 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
/build
/.vscode
*.so
builddir

### Python ###
# Byte-compiled / optimized / DLL files
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -38,6 +38,7 @@ project(tritonpythonbackend LANGUAGES C CXX)
# because python backend does not need to access CUDA or GPUs
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_GPU_TENSORS "Allow GPU input and output tensors" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
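The new TRITON_ENABLE_GPU_TENSORS option defaults to OFF, matching the "Remove the support for GPU tensors by default" change above, so BLS tensors are expected to live in host memory. A minimal sketch, assuming the standard pb_utils API, of building a CPU-resident input tensor from a numpy buffer ("INPUT0" is a placeholder name):

import numpy as np
import triton_python_backend_utils as pb_utils

# With GPU tensors disabled at build time, tensors passed to a BLS request
# are constructed from host (numpy) arrays.
input_tensor = pb_utils.Tensor("INPUT0", np.ones([1, 16], dtype=np.float32))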
83 changes: 82 additions & 1 deletion src/infer_request.cc
@@ -25,7 +25,7 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "infer_request.h"

#include "pb_utils.h"
#ifdef TRITON_PB_STUB
#include "infer_response.h"
#include "pb_stub.h"
@@ -148,4 +148,85 @@ InferRequest::LoadFromSharedMemory(
      requested_output_names, model_name, request->model_version);
}

#ifdef TRITON_PB_STUB
std::unique_ptr<InferResponse>
InferRequest::Exec()
{
  ResponseBatch* response_batch = nullptr;
  bool responses_is_set = false;
  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
  std::unique_ptr<SharedMemory>& shm_pool = stub->GetSharedMemory();
  IPCMessage* ipc_message = stub->GetIPCMessage();
  try {
    ipc_message->stub_command = PYTHONSTUB_CommandType::PYTHONSTUB_Execute;

    ExecuteArgs* exec_args;
    shm_pool->Map(
        (char**)&exec_args, sizeof(ExecuteArgs), ipc_message->stub_args);

    RequestBatch* request_batch;
    shm_pool->Map(
        (char**)&request_batch, sizeof(RequestBatch), exec_args->request_batch);
    request_batch->batch_size = 1;

    Request* request;
    shm_pool->Map((char**)&request, sizeof(Request), request_batch->requests);

    request->requested_input_count = this->Inputs().size();
    Tensor* tensors;
    shm_pool->Map(
        (char**)&tensors, sizeof(Tensor) * request->requested_input_count,
        request->inputs);

    // TODO: Custom handling for GPU
    size_t i = 0;
    for (auto& input_tensor : this->Inputs()) {
      input_tensor->SaveToSharedMemory(shm_pool, &tensors[i]);
      i += 1;
    }
    this->SaveToSharedMemory(shm_pool, request);

    ipc_message->stub_command =
        PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest;
    stub->NotifyParent();
    stub->WaitForNotification();

    ipc_message->stub_command = PYTHONSTUB_CommandType::PYTHONSTUB_Execute;

    shm_pool->MapOffset((char**)&response_batch, exec_args->response_batch);
    responses_is_set = true;

    if (response_batch->has_error) {
      if (response_batch->is_error_set) {
        char* err_string;
        LoadStringFromSharedMemory(shm_pool, response_batch->error, err_string);
        return std::make_unique<InferResponse>(
            std::vector<std::shared_ptr<PbTensor>>{},
            std::make_shared<PbError>(err_string));
      } else {
        return std::make_unique<InferResponse>(
            std::vector<std::shared_ptr<PbTensor>>{},
            std::make_shared<PbError>(
                "An error occurred while performing BLS request."));
      }
    }
  }
  catch (const PythonBackendException& pb_exception) {
    return std::make_unique<InferResponse>(
        std::vector<std::shared_ptr<PbTensor>>{},
        std::make_shared<PbError>(pb_exception.what()));
  }

  if (responses_is_set) {
    return InferResponse::LoadFromSharedMemory(
        shm_pool, response_batch->responses);
  } else {
    return std::make_unique<InferResponse>(
        std::vector<std::shared_ptr<PbTensor>>{},
        std::make_shared<PbError>(
            "An error occurred while performing BLS request."));
  }
}
#endif

}}} // namespace triton::backend::python
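On the Python side, the error branches in Exec() above surface as a response whose error flag is set rather than as an exception raised from exec(). A minimal sketch of checking for that, continuing the hypothetical bls_request from the sketch near the top of this page:

import triton_python_backend_utils as pb_utils

bls_response = bls_request.exec()

# Mirrors the has_error / is_error_set handling in Exec(): a failed BLS
# request is reported through the response object itself.
if bls_response.has_error():
    raise pb_utils.TritonModelException(bls_response.error().message())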
9 changes: 6 additions & 3 deletions src/infer_request.h
@@ -27,6 +27,7 @@
#pragma once

#include <string>
#include "infer_response.h"
#include "pb_tensor.h"

namespace triton { namespace backend { namespace python {
@@ -58,12 +59,14 @@ class InferRequest {
  /// space to save the inference request.
  void SaveToSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool, Request* request_shm);

  /// Create an Inference Request object from shared memory.
  /// \param shm_pool Shared memory pool
  /// \param request_offset Shared memory offset of the request.
  static std::unique_ptr<InferRequest> LoadFromSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool,
      off_t request_offset);
      std::unique_ptr<SharedMemory>& shm_pool, off_t request_offset);
#ifdef TRITON_PB_STUB
  std::unique_ptr<InferResponse> Exec();
#endif
};
}}}; // namespace triton::backend::python
5 changes: 3 additions & 2 deletions src/infer_response.cc
@@ -60,7 +60,7 @@ InferResponse::IsErrorMessageSet()

void
InferResponse::SaveToSharedMemory(
    std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm)
    std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm, bool copy)
{
  size_t output_tensor_length = output_tensors_.size();
  response_shm->has_error = false;
@@ -77,11 +77,12 @@
  size_t j = 0;
  for (auto& output_tensor : output_tensors_) {
    Tensor* output_tensor_shm = &output_tensors_shm[j];
    output_tensor->SaveToSharedMemory(shm_pool, output_tensor_shm);
    output_tensor->SaveToSharedMemory(shm_pool, output_tensor_shm, copy);
    j++;
  }

  if (this->HasError()) {
    response_shm->has_error = true;
    off_t error_offset;
    SaveStringToSharedMemory(
        shm_pool, error_offset, this->Error()->Message().c_str());
2 changes: 1 addition & 1 deletion src/infer_response.h
@@ -44,7 +44,7 @@ class InferResponse {
  bool IsErrorMessageSet();
  std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
  void SaveToSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm);
      std::unique_ptr<SharedMemory>& shm_pool, Response* response_shm, bool copy);
  static std::unique_ptr<InferResponse> LoadFromSharedMemory(
      std::unique_ptr<SharedMemory>& shm_pool, off_t response_offset);
  bool HasError();