Commit 21942bb
Fix the multi-instance overlap in TRT backend (#21)
* Fix the binding initialization logic to correctly detect repeated tensors

* Fix the multi-instance overlap in TRT backend
tanmayv25 authored Nov 9, 2021
1 parent 4f1190b commit 21942bb
Showing 3 changed files with 111 additions and 23 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -108,6 +108,7 @@ add_library(
src/tensorrt_model_instance.h
src/tensorrt_utils.cc
src/tensorrt_utils.h
+ src/semaphore.h
src/loader.cc
src/loader.h
src/logging.cc
57 changes: 57 additions & 0 deletions src/semaphore.h
@@ -0,0 +1,57 @@
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <condition_variable>
#include <mutex>

namespace triton { namespace backend { namespace tensorrt {

class Semaphore {
public:
explicit Semaphore(const int count) : count_(count) {}

void Release()
{
std::unique_lock<std::mutex> lck(mtx_);
count_++;
cv_.notify_one();
}

void Acquire()
{
std::unique_lock<std::mutex> lck(mtx_);
cv_.wait(lck, [this]() { return (count_ > 0); });
count_--;
}

private:
int count_;

std::mutex mtx_;
std::condition_variable cv_;
};

}}} // namespace triton::backend::tensorrt
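For orientation: the class above is a plain counting semaphore built from a mutex and a condition variable; Acquire() blocks while count_ is zero, and Release() increments the count and wakes one waiter. A minimal, hypothetical usage sketch, not part of this commit (the worker setup is purely illustrative):

// Hypothetical test harness, not part of this commit.
#include <iostream>
#include <thread>
#include <vector>
#include "semaphore.h"  // the header added above

int main()
{
  triton::backend::tensorrt::Semaphore sem(2);  // at most 2 workers proceed
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&sem, i]() {
      sem.Acquire();  // blocks while two workers already hold a count
      std::cout << "worker " << i << " running\n";
      sem.Release();  // wakes one waiter blocked in Acquire()
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  return 0;
}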
76 changes: 53 additions & 23 deletions src/tensorrt.cc
@@ -27,6 +27,7 @@
#include <future>
#include "loader.h"
#include "logging.h"
+ #include "semaphore.h"
#include "tensorrt_model.h"
#include "tensorrt_model_instance.h"
#include "tensorrt_utils.h"
@@ -221,6 +222,23 @@ class ModelState : public TensorRTModel {
void DisableEngineSharing() { engine_sharing_ = false; }
bool IsEngineSharingEnabled() { return engine_sharing_; }

+ struct SemaphoreContext {
+ SemaphoreContext() : next_sem_idx_(0) {}
+
+ std::vector<std::unique_ptr<Semaphore>> semaphore_list_;
+ int next_sem_idx_;
+ };
+
+ std::map<int, std::unique_ptr<SemaphoreContext>>& SemaphoreMap()
+ {
+ return semaphore_map_;
+ }
+
+ std::unique_ptr<SemaphoreContext>& SemaphoreDeviceContext(const int device_id)
+ {
+ return semaphore_map_[device_id];
+ }
+
private:
ModelState(TRITONBACKEND_Model* triton_model);

@@ -264,6 +282,9 @@ class ModelState : public TensorRTModel {
std::shared_ptr<nvinfer1::ICudaEngine>>>
device_engines_;
bool engine_sharing_;

+ // A map from device id to its semaphore context
+ std::map<int, std::unique_ptr<SemaphoreContext>> semaphore_map_;
};

TRITONSERVER_Error*
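One subtlety in the accessors above: SemaphoreDeviceContext() goes through std::map::operator[], which value-initializes a null std::unique_ptr on the first lookup of a device id, whereas RegisterSemaphore() (further down) populates the entry explicitly with find()/emplace(). A small standalone illustration of that operator[] behavior, not taken from the commit:

// Hypothetical illustration of std::map::operator[] with unique_ptr values.
#include <cassert>
#include <map>
#include <memory>

int main()
{
  std::map<int, std::unique_ptr<int>> m;
  std::unique_ptr<int>& slot = m[0];  // inserts key 0 holding a null pointer
  assert(slot == nullptr);            // entry exists but is empty
  slot.reset(new int(42));            // later population is visible via the map
  assert(*m[0] == 42);
  return 0;
}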
@@ -976,7 +997,7 @@ class ModelInstanceState : public TensorRTModelInstance {
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance);

- void RegisterContexts();
+ void RegisterSemaphore();
TRITONSERVER_Error* InitStreamsAndEvents();
TRITONSERVER_Error* InitEventSet(bool busy_wait_events);
TRITONSERVER_Error* DestroyEventSet();
@@ -1203,19 +1224,16 @@ class ModelInstanceState : public TensorRTModelInstance {
// executions' event states.
std::thread completion_thread_;

- triton::common::SyncQueue<size_t> context_queue_;
- size_t next_context_idx_;

// The details needed by the completion thread to finalize the
// response for a model execution.
struct Payload {
explicit Payload(
size_t event_set_idx, TRITONBACKEND_Request** requests,
- uint32_t request_count, size_t context_idx)
+ uint32_t request_count, size_t sem_idx)
: event_set_idx_(event_set_idx), total_batch_size_(0),
compute_start_ns_(0), compute_input_end_ns_(0),
compute_output_start_ns_(0), requests_(requests),
- request_count_(request_count), context_idx_(context_idx)
+ request_count_(request_count), sem_idx_(sem_idx)
{
}

@@ -1234,7 +1252,7 @@
std::vector<TRITONBACKEND_Request*> requests_list_;
TRITONBACKEND_Request** requests_;
uint32_t request_count_;
- size_t context_idx_;
+ size_t sem_idx_;

// All the generated InferenceResponse objects
std::vector<TRITONBACKEND_Response*> responses_;
@@ -1360,7 +1378,7 @@ ModelInstanceState::Create(
"' for model instance '" + (*state)->Name() + "'");
}

- (*state)->RegisterContexts();
+ (*state)->RegisterSemaphore();
RETURN_IF_ERROR((*state)->InitStreamsAndEvents());
RETURN_IF_ERROR(model_state->CreateEngine(
(*state)->DeviceId(), (*state)->DLACoreId(), model_path,
@@ -1579,9 +1597,11 @@ ModelInstanceState::ProcessRequests(
std::to_string(request_count) + " requests")
.c_str());

- auto context_idx = next_context_idx_;
+ auto& sem_context = (model_state_->SemaphoreDeviceContext(DeviceId()));
+
+ auto sem_idx = sem_context->next_sem_idx_;

- Run(requests, request_count, context_idx);
+ Run(requests, request_count, sem_idx);

bool run_failed = true;
for (size_t i = 0; i < request_count; ++i) {
@@ -1597,7 +1617,7 @@
if (run_failed) {
// On inference error, place the slot back to the queue
// immediately as all works for the slot should be ignored.
- context_queue_.Put(context_idx);
+ sem_context->semaphore_list_[sem_idx]->Release();
} else {
auto event_set_idx = next_set_;
next_set_ = (event_set_idx + 1) % EVENT_SET_COUNT;
@@ -1620,7 +1640,9 @@
}

// Block the execution if there are no available contexts.
- next_context_idx_ = context_queue_.Get();
+ sem_context->next_sem_idx_ =
+ (sem_idx + 1) % sem_context->semaphore_list_.size();
+ sem_context->semaphore_list_[sem_idx]->Acquire();
}

void
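Net effect of the ProcessRequests() changes above: after launching Run(), the instance advances the device-wide next_sem_idx_ round-robin over the semaphores registered for that GPU, then blocks in Acquire() until the completion path (next hunk) releases the slot once the execution's input buffers may be reused. A hypothetical two-thread sketch of that blocking handshake; the sleep stands in for the ready_for_input_ CUDA event and nothing here is taken from the commit:

// Hypothetical sketch of the request/completion pairing.
#include <chrono>
#include <cstdio>
#include <thread>
#include "semaphore.h"

int main()
{
  triton::backend::tensorrt::Semaphore slot(0);  // starts fully acquired

  std::thread completion([&slot]() {
    // Stand-in for the completion thread waiting on ready_for_input_.
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    std::printf("inputs consumed, releasing slot\n");
    slot.Release();
  });

  std::printf("execution launched, waiting for slot\n");
  slot.Acquire();  // returns once the completion thread releases
  std::printf("free to prepare the next batch\n");

  completion.join();
  return 0;
}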
@@ -2528,7 +2550,9 @@ ModelInstanceState::ProcessResponse()
// slots so that it can begin enqueuing new memcpys into the input
// buffers
cudaEventSynchronize(event_set.ready_for_input_);
- context_queue_.Put(payload->context_idx_);
+ (model_state_->SemaphoreDeviceContext(DeviceId()))
+ ->semaphore_list_[payload->sem_idx_]
+ ->Release();
NVTX_MARKER("plan_input_available");

// Call Finalize() here to defer CUDA synchronization as much as
@@ -2963,22 +2987,28 @@ ModelInstanceState::DestroyEventSet()
}

void
- ModelInstanceState::RegisterContexts()
+ ModelInstanceState::RegisterSemaphore()
{
- size_t context_idx = 0;
- context_queue_.Put(context_idx++);
- // If eager batching is set, we add additional slots per device
+ // If eager batching is set, we add to the semaphore resource count
// which allows to start preparing next batch before the previous
- // batch has completed. The number of duplicates are limitedby
+ // batch has completed. The number of duplicates are limited by
// number of event sets to prevent too many iterations are run
// ahead and to avoid interference of the event communication in
// the previous execution
- if (model_state_->EagerBatching()) {
- for (int count = 1; count < EVENT_SET_COUNT; ++count) {
- context_queue_.Put(context_idx++);
- }
+ int sem_count = (model_state_->EagerBatching()) ? EVENT_SET_COUNT : 1;
+ auto it = (model_state_->SemaphoreMap()).find(DeviceId());
+ if (it == (model_state_->SemaphoreMap()).end()) {
+ it = (model_state_->SemaphoreMap())
+ .emplace(
+ std::make_pair(DeviceId(), new ModelState::SemaphoreContext()))
+ .first;
}
+ it->second->semaphore_list_.emplace_back(new Semaphore(sem_count));
+
+ if (it->second->semaphore_list_.size() == 1) {
+ // Need to acquire a semaphore for the first inference request
+ it->second->semaphore_list_[it->second->next_sem_idx_]->Acquire();
+ }
- next_context_idx_ = context_queue_.Get();
}

TRITONSERVER_Error*
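To summarize RegisterSemaphore(): each model instance appends one semaphore to its device's SemaphoreContext, counted at EVENT_SET_COUNT under eager batching (so that many executions can run ahead) and 1 otherwise, and only the first instance on a device performs the priming Acquire() that pairs with the Release()-after-Run() pattern in ProcessRequests(). A condensed, hypothetical standalone model of the registration flow, with names simplified and only the logic above taken from the commit:

// Hypothetical model of per-device semaphore registration.
#include <cstdio>
#include <map>
#include <memory>
#include <vector>
#include "semaphore.h"

using triton::backend::tensorrt::Semaphore;

struct SemaphoreContext {
  std::vector<std::unique_ptr<Semaphore>> semaphore_list_;
  int next_sem_idx_ = 0;
};

static std::map<int, std::unique_ptr<SemaphoreContext>> semaphore_map_;

void RegisterOnDevice(int device_id, bool eager_batching, int event_set_count)
{
  int sem_count = eager_batching ? event_set_count : 1;
  auto it = semaphore_map_.find(device_id);
  if (it == semaphore_map_.end()) {
    it = semaphore_map_.emplace(device_id, new SemaphoreContext()).first;
  }
  it->second->semaphore_list_.emplace_back(new Semaphore(sem_count));
  if (it->second->semaphore_list_.size() == 1) {
    it->second->semaphore_list_[0]->Acquire();  // prime the first slot
  }
}

int main()
{
  RegisterOnDevice(0, /*eager_batching=*/false, /*event_set_count=*/2);
  RegisterOnDevice(0, false, 2);  // second instance shares the same context
  std::printf("slots on device 0: %zu\n",
              semaphore_map_[0]->semaphore_list_.size());  // prints 2
  return 0;
}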
