From ad1f7c1c8c45544c45e1e4cf81e4cdcc505a3506 Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Fri, 15 Mar 2024 16:11:41 -0700
Subject: [PATCH] Exposing trace context to python backend (#346) (#347)

Exposing trace context to python backend
---
 CMakeLists.txt          |   2 +
 src/infer_request.cc    |  17 +++++--
 src/infer_request.h     |  24 ++--------
 src/infer_trace.cc      | 101 ++++++++++++++++++++++++++++++++++++++++
 src/infer_trace.h       |  90 +++++++++++++++++++++++++++++++++++
 src/pb_stub.cc          |  11 ++++-
 src/python_be.cc        |  14 +++++-
 src/request_executor.cc |   6 ++-
 8 files changed, 235 insertions(+), 30 deletions(-)
 create mode 100644 src/infer_trace.cc
 create mode 100644 src/infer_trace.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dacd0f9c..46f4bfe4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,6 +153,8 @@ set(
     src/infer_response.h
     src/infer_request.cc
     src/infer_request.h
+    src/infer_trace.cc
+    src/infer_trace.h
     src/message_queue.h
     src/ipc_message.cc
     src/ipc_message.h
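What this enables from a model's perspective: `request.trace()` now returns an `InferenceTrace` object whose new `get_context()` binding (added in `src/pb_stub.cc` below) returns the propagated trace context, or `None` when tracing is disabled. A minimal `model.py` sketch; the tensor names and the echo logic are illustrative, not part of this patch:

    import triton_python_backend_utils as pb_utils


    class TritonPythonModel:
        def execute(self, requests):
            responses = []
            for request in requests:
                # New in this patch: the trace context propagated by Triton
                # core. Returns None if tracing is disabled or no context is
                # associated with this request.
                context = request.trace().get_context()
                if context is not None:
                    pb_utils.Logger.log_info(f"Trace context: {context}")
                # Echo INPUT0 back as OUTPUT0 (illustrative only).
                input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
                output0 = pb_utils.Tensor("OUTPUT0", input0.as_numpy())
                responses.append(pb_utils.InferenceResponse([output0]))
            return responses
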
diff --git a/src/infer_request.cc b/src/infer_request.cc
index f18900d0..aa34447e 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -170,7 +170,7 @@ InferRequest::GetPreferredMemory()
 }
 
 InferenceTrace&
-InferRequest::Trace()
+InferRequest::GetTrace()
 {
   return trace_;
 }
@@ -214,7 +214,6 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
   infer_request_shm_ptr_->is_decoupled = is_decoupled_;
   infer_request_shm_ptr_->timeout = timeout_;
   infer_request_shm_ptr_->preferred_memory = preferred_memory_;
-  infer_request_shm_ptr_->trace = trace_;
   infer_request_shm_ptr_->request_release_flags = request_release_flags_;
 
   output_names_handle_shm_ptr_ =
@@ -271,6 +270,9 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
       reinterpret_cast<char*>(infer_request_shm_ptr_) + parameters_offset,
       infer_request_shm.handle_ + parameters_offset);
 
+  trace_.SaveToSharedMemory(shm_pool);
+  infer_request_shm_ptr_->trace_shm_handle = trace_.ShmHandle();
+
   // Save the references to shared memory.
   infer_request_shm_ = std::move(infer_request_shm);
   request_id_shm_ = std::move(request_id_shm);
@@ -327,6 +329,10 @@ InferRequest::LoadFromSharedMemory(
       (infer_request_shm_ptr->input_count *
        sizeof(bi::managed_external_buffer::handle_t));
 
+  std::unique_ptr<InferenceTrace> infer_trace_shm =
+      InferenceTrace::LoadFromSharedMemory(
+          shm_pool, infer_request_shm_ptr->trace_shm_handle);
+
   std::unique_ptr<PbString> model_name_shm = PbString::LoadFromSharedMemory(
       request_handle + model_name_offset,
       reinterpret_cast<char*>(infer_request_shm_ptr) + model_name_offset);
@@ -343,7 +349,7 @@ InferRequest::LoadFromSharedMemory(
 
   return std::unique_ptr<InferRequest>(new InferRequest(
       infer_request_shm, request_id_shm, requested_output_names_shm,
-      model_name_shm, input_tensors, parameters_shm));
+      model_name_shm, input_tensors, parameters_shm, infer_trace_shm));
 }
 
 InferRequest::InferRequest(
@@ -352,7 +358,8 @@ InferRequest::InferRequest(
     std::vector<std::unique_ptr<PbString>>& requested_output_names_shm,
     std::unique_ptr<PbString>& model_name_shm,
     std::vector<std::shared_ptr<PbTensor>>& input_tensors,
-    std::unique_ptr<PbString>& parameters_shm)
+    std::unique_ptr<PbString>& parameters_shm,
+    std::unique_ptr<InferenceTrace>& infer_trace_shm)
     : infer_request_shm_(std::move(infer_request_shm)),
       request_id_shm_(std::move(request_id_shm)),
       requested_output_names_shm_(std::move(requested_output_names_shm)),
@@ -393,7 +400,7 @@ InferRequest::InferRequest(
   is_decoupled_ = infer_request_shm_ptr_->is_decoupled;
   timeout_ = infer_request_shm_ptr_->timeout;
   preferred_memory_ = infer_request_shm_ptr_->preferred_memory;
-  trace_ = infer_request_shm_ptr_->trace;
+  trace_ = InferenceTrace(infer_trace_shm);
   request_release_flags_ = infer_request_shm_ptr_->request_release_flags;
 
 #ifdef TRITON_PB_STUB
diff --git a/src/infer_request.h b/src/infer_request.h
index ba586535..9bf9dfdb 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -30,6 +30,7 @@
 #include <future>
 
 #include "infer_response.h"
+#include "infer_trace.h"
 #include "pb_preferred_memory.h"
 #include "pb_tensor.h"
 
@@ -42,22 +43,6 @@ namespace triton { namespace backend { namespace python {
 
 class Stub;
 
-//
-// Inference Trace
-//
-struct InferenceTrace {
-#ifndef TRITON_PB_STUB
-  TRITONSERVER_InferenceTrace* triton_trace_;
-  InferenceTrace(TRITONSERVER_InferenceTrace* triton_trace)
-      : triton_trace_(triton_trace)
-  {
-  }
-#else
-  void* triton_trace_;
-#endif
-  InferenceTrace() : triton_trace_(nullptr) {}
-};
-
 //
 // Inference Request
 //
@@ -72,7 +57,7 @@ struct InferRequestShm {
   bool is_decoupled;
   uint64_t timeout;
   PreferredMemory preferred_memory;
-  InferenceTrace trace;
+  bi::managed_external_buffer::handle_t trace_shm_handle;
   uint32_t request_release_flags;
 };
 
@@ -104,7 +89,7 @@ class InferRequest {
   bool IsDecoupled();
   void SetIsDecoupled(const bool is_decoupled);
   PreferredMemory& GetPreferredMemory();
-  InferenceTrace& Trace();
+  InferenceTrace& GetTrace();
   uint32_t ReleaseFlags();
   void SetReleaseFlags(const uint32_t& flags);
 
@@ -144,7 +129,8 @@ class InferRequest {
       std::vector<std::unique_ptr<PbString>>& requested_output_names_shm,
       std::unique_ptr<PbString>& model_name_shm,
      std::vector<std::shared_ptr<PbTensor>>& input_tensors,
-      std::unique_ptr<PbString>& parameters_shm);
+      std::unique_ptr<PbString>& parameters_shm,
+      std::unique_ptr<InferenceTrace>& infer_trace_shm);
 
   std::string request_id_;
   uint64_t correlation_id_;
diff --git a/src/infer_trace.cc b/src/infer_trace.cc
new file mode 100644
index 00000000..50645dcc
--- /dev/null
+++ b/src/infer_trace.cc
@@ -0,0 +1,101 @@
+// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "infer_trace.h"
+
+namespace triton { namespace backend { namespace python {
+
+InferenceTrace::InferenceTrace(const InferenceTrace& rhs)
+{
+  triton_trace_ = rhs.triton_trace_;
+  trace_context_ = rhs.trace_context_;
+}
+
+InferenceTrace&
+InferenceTrace::operator=(const InferenceTrace& rhs)
+{
+  triton_trace_ = rhs.triton_trace_;
+  trace_context_ = rhs.trace_context_;
+  return *this;
+}
+
+InferenceTrace::InferenceTrace(std::unique_ptr<InferenceTrace>& trace_shm)
+{
+  triton_trace_ = trace_shm->triton_trace_;
+  trace_context_ = trace_shm->trace_context_;
+}
+
+void
+InferenceTrace::SaveToSharedMemory(
+    std::unique_ptr<SharedMemoryManager>& shm_pool)
+{
+  AllocatedSharedMemory<InferenceTraceShm> infer_trace_shm =
+      shm_pool->Construct<InferenceTraceShm>();
+  infer_trace_shm_ptr_ = infer_trace_shm.data_.get();
+
+  infer_trace_shm_ptr_->triton_trace = triton_trace_;
+
+  std::unique_ptr<PbString> trace_context_shm =
+      PbString::Create(shm_pool, trace_context_);
+
+  infer_trace_shm_ptr_->trace_context_shm_handle =
+      trace_context_shm->ShmHandle();
+
+  // Save the references to shared memory.
+  trace_context_shm_ = std::move(trace_context_shm);
+  infer_trace_shm_ = std::move(infer_trace_shm);
+  shm_handle_ = infer_trace_shm_.handle_;
+}
+
+std::unique_ptr<InferenceTrace>
+InferenceTrace::LoadFromSharedMemory(
+    std::unique_ptr<SharedMemoryManager>& shm_pool,
+    bi::managed_external_buffer::handle_t handle)
+{
+  AllocatedSharedMemory<InferenceTraceShm> infer_trace_shm =
+      shm_pool->Load<InferenceTraceShm>(handle);
+  InferenceTraceShm* infer_trace_shm_ptr = infer_trace_shm.data_.get();
+
+  std::unique_ptr<PbString> trace_context_shm = PbString::LoadFromSharedMemory(
+      shm_pool, infer_trace_shm_ptr->trace_context_shm_handle);
+
+  return std::unique_ptr<InferenceTrace>(
+      new InferenceTrace(infer_trace_shm, trace_context_shm));
+}
+
+InferenceTrace::InferenceTrace(
+    AllocatedSharedMemory<InferenceTraceShm>& infer_trace_shm,
+    std::unique_ptr<PbString>& trace_context_shm)
+    : infer_trace_shm_(std::move(infer_trace_shm)),
+      trace_context_shm_(std::move(trace_context_shm))
+{
+  infer_trace_shm_ptr_ = infer_trace_shm_.data_.get();
+  shm_handle_ = infer_trace_shm_.handle_;
+  triton_trace_ = infer_trace_shm_ptr_->triton_trace;
+  trace_context_ = trace_context_shm_->String();
+}
+
+}}};  // namespace triton::backend::python
diff --git a/src/infer_trace.h b/src/infer_trace.h
new file mode 100644
index 00000000..aac9137f
--- /dev/null
+++ b/src/infer_trace.h
@@ -0,0 +1,90 @@
+// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <string>
+
+#include "pb_string.h"
+#include "pb_utils.h"
+
+namespace triton { namespace backend { namespace python {
+
+struct InferenceTraceShm {
+  bi::managed_external_buffer::handle_t trace_context_shm_handle;
+  // The address of the 'TRITONSERVER_InferenceTrace' object.
+  void* triton_trace;
+};
+
+//
+// Inference Trace
+//
+class InferenceTrace {
+ public:
+  InferenceTrace(void* triton_trace, const std::string& ctxt)
+      : triton_trace_(triton_trace), trace_context_(ctxt)
+  {
+  }
+  InferenceTrace() : triton_trace_(nullptr), trace_context_("") {}
+  InferenceTrace(const InferenceTrace& rhs);
+  InferenceTrace(std::unique_ptr<InferenceTrace>& trace_shm);
+  InferenceTrace& operator=(const InferenceTrace& rhs);
+
+  /// Save InferenceTrace object to shared memory.
+  /// \param shm_pool Shared memory pool to save the InferenceTrace object.
+  void SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool);
+
+  /// Create an InferenceTrace object from shared memory.
+  /// \param shm_pool Shared memory pool
+  /// \param handle Shared memory handle of the InferenceTrace.
+  /// \return Returns the InferenceTrace in the specified handle
+  /// location.
+  static std::unique_ptr<InferenceTrace> LoadFromSharedMemory(
+      std::unique_ptr<SharedMemoryManager>& shm_pool,
+      bi::managed_external_buffer::handle_t handle);
+
+  void* TritonTrace() { return triton_trace_; }
+  const std::string& Context() const { return trace_context_; }
+
+  bi::managed_external_buffer::handle_t ShmHandle() { return shm_handle_; }
+
+ private:
+  // The private constructor for creating an InferenceTrace object from shared
+  // memory.
+  InferenceTrace(
+      AllocatedSharedMemory<InferenceTraceShm>& infer_trace_shm,
+      std::unique_ptr<PbString>& trace_context_shm);
+
+  void* triton_trace_;
+  std::string trace_context_;
+
+  // Shared Memory Data Structures
+  AllocatedSharedMemory<InferenceTraceShm> infer_trace_shm_;
+  InferenceTraceShm* infer_trace_shm_ptr_;
+  bi::managed_external_buffer::handle_t shm_handle_;
+  std::unique_ptr<PbString> trace_context_shm_;
+};
+
+}}};  // namespace triton::backend::python
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 26003f71..695b02f5 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1610,7 +1610,14 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       .export_values();
 
   py::class_<InferenceTrace, std::shared_ptr<InferenceTrace>>(
-      module, "InferenceTrace");
+      module, "InferenceTrace")
+      .def("get_context", [](InferenceTrace& self) -> py::object {
+        auto context = self.Context();
+        if (context != "") {
+          return py::str(context);
+        }
+        return py::none();
+      });
 
   py::class_<InferRequest, std::shared_ptr<InferRequest>>(
       module, "InferenceRequest")
@@ -1674,7 +1681,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       .def("set_flags", &InferRequest::SetFlags)
       .def("timeout", &InferRequest::Timeout)
       .def("parameters", &InferRequest::Parameters)
-      .def("trace", &InferRequest::Trace)
+      .def("trace", &InferRequest::GetTrace)
       .def(
           "exec",
           [](std::shared_ptr<InferRequest>& infer_request,
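A note on the `get_context` binding above: it maps an empty context to `None` rather than `""`, so Python code can distinguish "no trace context" from a real value with a plain `is None` check. When Triton is built with OpenTelemetry tracing, the context is expected to be a JSON string carrying W3C trace-context headers, but that shape is a property of the server's tracing configuration, not something this patch guarantees; the parsing below is an illustrative assumption:

    import json


    def traceparent_from(trace):
        """Return the W3C `traceparent` header from an InferenceTrace, if any.

        Assumes the context, when present, is a JSON object such as
        {"traceparent": "...", "tracestate": "..."}, as produced by an
        OpenTelemetry-enabled Triton build.
        """
        context = trace.get_context()
        if context is None:
            return None  # Tracing disabled or no context for this request.
        return json.loads(context).get("traceparent")
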
diff --git a/src/python_be.cc b/src/python_be.cc
index 0fa318ff..b9ba7302 100644
--- a/src/python_be.cc
+++ b/src/python_be.cc
@@ -371,14 +371,25 @@ ModelInstanceState::SaveRequestsToSharedMemory(
 
     // Do not return if error in this case, because Triton core
     // will return an error if tracing is disabled (see PYBE PR#295).
+    // For the same reason, we do not log the error message, otherwise
+    // when Triton is compiled without tracing, it'll constantly log
+    // this error.
     TRITONSERVER_InferenceTrace* triton_trace;
     auto err = TRITONBACKEND_RequestTrace(request, &triton_trace);
     if (err != nullptr) {
       triton_trace = nullptr;
       TRITONSERVER_ErrorDelete(err);
     }
+    const char* val = nullptr;
+    if (triton_trace != nullptr) {
+      LOG_IF_ERROR(
+          TRITONSERVER_InferenceTraceContext(triton_trace, &val),
+          "failed to retrieve trace context");
+    }
+    std::string context = (val != nullptr) ? std::string(val) : "";
 
-    InferenceTrace trace = InferenceTrace(triton_trace);
+    InferenceTrace trace =
+        InferenceTrace(reinterpret_cast<void*>(triton_trace), context);
 
     uint64_t request_timeout;
     RETURN_IF_ERROR(TRITONBACKEND_InferenceRequestTimeoutMicroseconds(
@@ -403,7 +414,6 @@ ModelInstanceState::SaveRequestsToSharedMemory(
           reinterpret_cast<intptr_t>(request),
           PreferredMemory(PreferredMemory::kDefault, 0), trace);
     }
-
     RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool()));
     requests_shm[r] = infer_request->ShmHandle();
     pb_infer_requests.emplace_back(std::move(infer_request));
diff --git a/src/request_executor.cc b/src/request_executor.cc
index d78972a5..39a4b9b6 100644
--- a/src/request_executor.cc
+++ b/src/request_executor.cc
@@ -367,9 +367,11 @@ RequestExecutor::Infer(
         irequest, InferRequestComplete, nullptr /* request_release_userp */));
 
     TRITONSERVER_InferenceTrace* trace = nullptr;
-    if (infer_request->Trace().triton_trace_ != nullptr) {
+    if (infer_request->GetTrace().TritonTrace() != nullptr) {
       THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceTraceSpawnChildTrace(
-          infer_request->Trace().triton_trace_, &trace));
+          reinterpret_cast<TRITONSERVER_InferenceTrace*>(
+              infer_request->GetTrace().TritonTrace()),
+          &trace));
     }
 
     const std::string& param_str = infer_request->Parameters();
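Finally, the `request_executor.cc` hunk keeps child-trace spawning working for BLS: when a BLS request carries the parent's trace, `RequestExecutor::Infer` spawns a child via `TRITONSERVER_InferenceTraceSpawnChildTrace`. A sketch of wiring that up from `model.py`, assuming the `trace` keyword argument that `InferenceRequest` gained alongside the PYBE PR#295 work referenced above; `downstream_model` and the tensor wiring are illustrative:

    import triton_python_backend_utils as pb_utils


    def bls_with_trace(request):
        # Hand the parent request's trace to the BLS request so the backend
        # spawns a child trace for the downstream call.
        infer_request = pb_utils.InferenceRequest(
            model_name="downstream_model",
            inputs=[pb_utils.get_input_tensor_by_name(request, "INPUT0")],
            requested_output_names=["OUTPUT0"],
            trace=request.trace(),
        )
        return infer_request.exec()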