diff --git a/src/metrics.cc b/src/metrics.cc
index b5e341d61..28ab921ca 100644
--- a/src/metrics.cc
+++ b/src/metrics.cc
@@ -32,6 +32,7 @@
 #include <thread>
 
 #include "constants.h"
+#include "pinned_memory_manager.h"
 #include "prometheus/detail/utils.h"
 #include "triton/common/logging.h"
 
@@ -108,6 +109,18 @@ Metrics::Metrics()
                     "execution per-model.")
               .Register(*registry_)),
 
+      pinned_memory_pool_total_family_(
+          prometheus::BuildGauge()
+              .Name("nv_pinned_memory_pool_total_bytes")
+              .Help("Pinned memory pool total memory size, in bytes")
+              .Register(*registry_)),
+
+      pinned_memory_pool_used_family_(
+          prometheus::BuildGauge()
+              .Name("nv_pinned_memory_pool_used_bytes")
+              .Help("Pinned memory pool used memory size, in bytes")
+              .Register(*registry_)),
+
       // Per-model cache metric families
       // NOTE: These are used in infer_stats for perf_analyzer
       cache_num_hits_model_family_(prometheus::BuildCounter()
@@ -222,7 +235,8 @@ Metrics::Metrics()
 
 #endif  // TRITON_ENABLE_METRICS_CPU
       metrics_enabled_(false), gpu_metrics_enabled_(false),
-      cpu_metrics_enabled_(false), metrics_interval_ms_(2000)
+      cpu_metrics_enabled_(false), pinned_memory_metrics_enabled_(false),
+      metrics_interval_ms_(2000)
 {
 }
 
@@ -295,8 +309,23 @@ Metrics::EnableMetrics()
 {
   auto singleton = GetSingleton();
   singleton->metrics_enabled_ = true;
+
+  EnablePinnedMemoryMetrics();
 }
 
+void
+Metrics::EnablePinnedMemoryMetrics()
+{
+  auto singleton = GetSingleton();
+  if (singleton->pinned_memory_metrics_enabled_) {
+    return;
+  }
+  // Initialize
+  singleton->InitializePinnedMemoryMetrics();
+  singleton->pinned_memory_metrics_enabled_ = true;
+}
+
+
 void
 Metrics::EnableGPUMetrics()
 {
@@ -357,8 +386,10 @@ bool
 Metrics::StartPollingThread()
 {
   // Nothing to poll if no polling metrics enabled, don't spawn a thread
-  if (!gpu_metrics_enabled_ && !cpu_metrics_enabled_) {
-    LOG_WARNING << "No polling metrics (CPU, GPU) are enabled. Will not "
+  if (!gpu_metrics_enabled_ && !cpu_metrics_enabled_ &&
+      !pinned_memory_metrics_enabled_) {
+    LOG_WARNING << "No polling metrics (CPU, GPU, Pinned memory) are "
+                   "enabled. Will not "
                    "poll for them.";
     return false;
   }
@@ -372,6 +403,10 @@ Metrics::StartPollingThread()
       std::this_thread::sleep_for(
          std::chrono::milliseconds(metrics_interval_ms_ / 2));
 
+      if (pinned_memory_metrics_enabled_) {
+        PollPinnedMemoryMetrics();
+      }
+
 #ifdef TRITON_ENABLE_METRICS_GPU
       // Poll DCGM GPU metrics
       if (gpu_metrics_enabled_ &&
@@ -391,6 +426,20 @@
   return true;
 }
 
+bool
+Metrics::PollPinnedMemoryMetrics()
+{
+  uint64_t pinned_memory_byte_size =
+      PinnedMemoryManager::GetTotalPinnedMemoryByteSize();
+  uint64_t used_pinned_memory_byte_size =
+      PinnedMemoryManager::GetUsedPinnedMemoryByteSize();
+
+  pinned_memory_pool_total_->Set(pinned_memory_byte_size);
+  pinned_memory_pool_used_->Set(used_pinned_memory_byte_size);
+
+  return true;
+}
+
 #ifdef TRITON_ENABLE_METRICS_CPU
 Status
 Metrics::ParseCpuInfo(CpuInfo& info)
@@ -676,6 +725,17 @@ Metrics::PollDcgmMetrics()
 #endif  // TRITON_ENABLE_METRICS_GPU
 }
 
+bool
+Metrics::InitializePinnedMemoryMetrics()
+{
+  const std::map<std::string, std::string> pinned_memory_labels;
+  pinned_memory_pool_total_ =
+      &pinned_memory_pool_total_family_.Add(pinned_memory_labels);
+  pinned_memory_pool_used_ =
+      &pinned_memory_pool_used_family_.Add(pinned_memory_labels);
+  return true;
+}
+
 bool
 Metrics::InitializeCpuMetrics()
 {
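With the metrics.cc changes above, the existing polling thread refreshes the two new gauges on every polling cycle, next to the CPU and GPU metrics. As an illustration only (not part of the patch, values are made up), a scrape of the metrics endpoint would then be expected to contain exposition text along these lines, where the total tracks the configured --pinned-memory-pool-byte-size (256 MB by default) and the used value rises and falls with in-flight requests:

    # HELP nv_pinned_memory_pool_total_bytes Pinned memory pool total memory size, in bytes
    # TYPE nv_pinned_memory_pool_total_bytes gauge
    nv_pinned_memory_pool_total_bytes 268435456
    # HELP nv_pinned_memory_pool_used_bytes Pinned memory pool used memory size, in bytes
    # TYPE nv_pinned_memory_pool_used_bytes gauge
    nv_pinned_memory_pool_used_bytes 0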
Will not " "poll for them."; return false; } @@ -372,6 +403,10 @@ Metrics::StartPollingThread() std::this_thread::sleep_for( std::chrono::milliseconds(metrics_interval_ms_ / 2)); + if (pinned_memory_metrics_enabled_) { + PollPinnedMemoryMetrics(); + } + #ifdef TRITON_ENABLE_METRICS_GPU // Poll DCGM GPU metrics if (gpu_metrics_enabled_ && @@ -391,6 +426,20 @@ Metrics::StartPollingThread() return true; } +bool +Metrics::PollPinnedMemoryMetrics() +{ + uint64_t pinned_memory_byte_size = + PinnedMemoryManager::GetTotalPinnedMemoryByteSize(); + uint64_t used_pinned_memory_byte_size = + PinnedMemoryManager::GetUsedPinnedMemoryByteSize(); + + pinned_memory_pool_total_->Set(pinned_memory_byte_size); + pinned_memory_pool_used_->Set(used_pinned_memory_byte_size); + + return true; +} + #ifdef TRITON_ENABLE_METRICS_CPU Status Metrics::ParseCpuInfo(CpuInfo& info) @@ -676,6 +725,17 @@ Metrics::PollDcgmMetrics() #endif // TRITON_ENABLE_METRICS_GPU } +bool +Metrics::InitializePinnedMemoryMetrics() +{ + const std::map pinned_memory_labels; + pinned_memory_pool_total_ = + &pinned_memory_pool_total_family_.Add(pinned_memory_labels); + pinned_memory_pool_used_ = + &pinned_memory_pool_used_family_.Add(pinned_memory_labels); + return true; +} + bool Metrics::InitializeCpuMetrics() { diff --git a/src/metrics.h b/src/metrics.h index 70b416eef..2abec1cf3 100644 --- a/src/metrics.h +++ b/src/metrics.h @@ -117,6 +117,9 @@ class Metrics { // Enable reporting of metrics static void EnableMetrics(); + // Enable reporting of Pinned memory metrics + static void EnablePinnedMemoryMetrics(); + // Enable reporting of GPU metrics static void EnableGPUMetrics(); @@ -270,7 +273,9 @@ class Metrics { static Metrics* GetSingleton(); bool InitializeDcgmMetrics(); bool InitializeCpuMetrics(); + bool InitializePinnedMemoryMetrics(); bool StartPollingThread(); + bool PollPinnedMemoryMetrics(); bool PollDcgmMetrics(); bool PollCpuMetrics(); @@ -295,6 +300,11 @@ class Metrics { inf_compute_output_duration_us_family_; prometheus::Family& inf_pending_request_count_family_; + prometheus::Family& pinned_memory_pool_total_family_; + prometheus::Family& pinned_memory_pool_used_family_; + prometheus::Gauge* pinned_memory_pool_total_; + prometheus::Gauge* pinned_memory_pool_used_; + // Per-model Response Cache metrics // NOTE: Per-model metrics are used in infer_stats for perf_analyzer. 
diff --git a/src/pinned_memory_manager.cc b/src/pinned_memory_manager.cc
index 0321bb787..e13a57a81 100644
--- a/src/pinned_memory_manager.cc
+++ b/src/pinned_memory_manager.cc
@@ -65,12 +65,16 @@ ParseIntOption(const std::string& msg, const std::string& arg, int* value)
 }  // namespace
 
 std::unique_ptr<PinnedMemoryManager> PinnedMemoryManager::instance_;
-uint64_t PinnedMemoryManager::pinned_memory_byte_size_;
+uint64_t PinnedMemoryManager::pinned_memory_byte_size_ = 0;
+std::mutex PinnedMemoryManager::allocated_buffer_mtx_;
+std::vector<std::shared_ptr<PinnedMemoryManager::PinnedMemory>>
+    PinnedMemoryManager::allocated_pinned_memory_buffers_;
 
 PinnedMemoryManager::PinnedMemory::PinnedMemory(
     void* pinned_memory_buffer, uint64_t size)
     : pinned_memory_buffer_(pinned_memory_buffer)
 {
+  used_pinned_memory_byte_size_ = 0;
   if (pinned_memory_buffer_ != nullptr) {
     managed_pinned_memory_ = boost::interprocess::managed_external_buffer(
         boost::interprocess::create_only_t{}, pinned_memory_buffer_, size);
@@ -87,9 +91,42 @@ PinnedMemoryManager::PinnedMemory::~PinnedMemory()
 #endif  // TRITON_ENABLE_GPU
 }
 
+void*
+PinnedMemoryManager::PinnedMemory::Allocate(uint64_t size)
+{
+  std::lock_guard<std::mutex> lk(buffer_mtx_);
+  void* ptr = managed_pinned_memory_.allocate(size, std::nothrow_t{});
+  // Only account for successful allocations.
+  if (ptr != nullptr) {
+    used_pinned_memory_byte_size_ += size;
+    allocated_memory_info_.emplace(ptr, size);
+  }
+  return ptr;
+}
+
+void
+PinnedMemoryManager::PinnedMemory::Deallocate(void* ptr)
+{
+  std::lock_guard<std::mutex> lk(buffer_mtx_);
+  managed_pinned_memory_.deallocate(ptr);
+  auto it = allocated_memory_info_.find(ptr);
+  if (it != allocated_memory_info_.end()) {
+    used_pinned_memory_byte_size_ -= it->second;
+    allocated_memory_info_.erase(it);
+  }
+}
+
+uint64_t
+PinnedMemoryManager::PinnedMemory::GetUsedPinnedMemorySizeInternal()
+{
+  std::lock_guard<std::mutex> lk(buffer_mtx_);
+  return used_pinned_memory_byte_size_;
+}
+
 PinnedMemoryManager::~PinnedMemoryManager()
 {
   // Clean up
+  allocated_pinned_memory_buffers_.clear();
   for (const auto& memory_info : memory_info_) {
     const auto& is_pinned = memory_info.second.first;
     if (!is_pinned) {
@@ -104,6 +141,10 @@ PinnedMemoryManager::AddPinnedMemoryBuffer(
     unsigned long node_mask)
 {
   pinned_memory_buffers_[node_mask] = pinned_memory_buffer;
+  {
+    std::lock_guard<std::mutex> lk(allocated_buffer_mtx_);
+    allocated_pinned_memory_buffers_.push_back(pinned_memory_buffer);
+  }
 }
 
 Status
@@ -113,9 +154,7 @@ PinnedMemoryManager::AllocInternal(
 {
   auto status = Status::Success;
   if (pinned_memory_buffer->pinned_memory_buffer_ != nullptr) {
-    std::lock_guard<std::mutex> lk(pinned_memory_buffer->buffer_mtx_);
-    *ptr = pinned_memory_buffer->managed_pinned_memory_.allocate(
-        size, std::nothrow_t{});
+    *ptr = pinned_memory_buffer->Allocate(size);
     *allocated_type = TRITONSERVER_MEMORY_CPU_PINNED;
     if (*ptr == nullptr) {
       status = Status(
@@ -167,8 +206,7 @@ PinnedMemoryManager::AllocInternal(
 
   if ((!status.IsOk()) && (*ptr != nullptr)) {
     if (is_pinned) {
-      std::lock_guard<std::mutex> lk(pinned_memory_buffer->buffer_mtx_);
-      pinned_memory_buffer->managed_pinned_memory_.deallocate(*ptr);
+      pinned_memory_buffer->Deallocate(*ptr);
     } else {
       free(*ptr);
     }
@@ -201,8 +239,7 @@ PinnedMemoryManager::FreeInternal(void* ptr)
   }
 
   if (is_pinned) {
-    std::lock_guard<std::mutex> lk(pinned_memory_buffer->buffer_mtx_);
-    pinned_memory_buffer->managed_pinned_memory_.deallocate(ptr);
+    pinned_memory_buffer->Deallocate(ptr);
   } else {
     free(ptr);
   }
@@ -376,4 +413,24 @@ PinnedMemoryManager::Free(void* ptr)
 
   return instance_->FreeInternal(ptr);
 }
+
+uint64_t
+PinnedMemoryManager::GetTotalPinnedMemoryByteSize()
+{
+  return pinned_memory_byte_size_;
+}
+
+uint64_t
+PinnedMemoryManager::GetUsedPinnedMemoryByteSize()
+{
+  std::lock_guard<std::mutex> lk(allocated_buffer_mtx_);
+  uint64_t used_pinned_memory_size = 0;
+  if (!allocated_pinned_memory_buffers_.empty()) {
+    for (const auto& it : allocated_pinned_memory_buffers_) {
+      used_pinned_memory_size += it->GetUsedPinnedMemorySizeInternal();
+    }
+  }
+
+  return used_pinned_memory_size;
+}
 
 }}  // namespace triton::core
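A quick way to see the new accounting in action is sketched below; it is not part of the patch or of the existing tests. It assumes PinnedMemoryManager::Create() has already been called with a non-zero pool size (as the server does at startup) and that the existing Alloc() entry point takes the output pointer, the byte size, an output memory type, and a flag allowing fallback to non-pinned memory (the parameter name used here is assumed):

    #include <iostream>

    #include "pinned_memory_manager.h"

    void
    ExercisePinnedAccounting()
    {
      using triton::core::PinnedMemoryManager;

      const uint64_t total = PinnedMemoryManager::GetTotalPinnedMemoryByteSize();
      const uint64_t before = PinnedMemoryManager::GetUsedPinnedMemoryByteSize();

      void* buffer = nullptr;
      TRITONSERVER_MemoryType allocated_type;
      auto status = PinnedMemoryManager::Alloc(
          &buffer, 1024 /* size */, &allocated_type,
          false /* allow_nonpinned_fallback, assumed parameter */);

      if (status.IsOk() && (allocated_type == TRITONSERVER_MEMORY_CPU_PINNED)) {
        // Usage is accounted per requested byte size, so the delta here is 1024
        // even though the boost-managed buffer adds its own bookkeeping overhead.
        std::cout << "used delta: "
                  << (PinnedMemoryManager::GetUsedPinnedMemoryByteSize() - before)
                  << " of total " << total << std::endl;
        PinnedMemoryManager::Free(buffer);
      }
    }

Note that GetUsedPinnedMemoryByteSize() reports the sum of requested allocation sizes across all registered pool buffers, not the internal fragmentation or allocator overhead of boost::interprocess.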
diff --git a/src/pinned_memory_manager.h b/src/pinned_memory_manager.h
index 993a5fa70..b0883524f 100644
--- a/src/pinned_memory_manager.h
+++ b/src/pinned_memory_manager.h
@@ -77,13 +77,22 @@ class PinnedMemoryManager {
   // Return Status object indicating success or failure.
   static Status Free(void* ptr);
 
+  static uint64_t GetTotalPinnedMemoryByteSize();
+  static uint64_t GetUsedPinnedMemoryByteSize();
+
  private:
   class PinnedMemory {
    public:
    PinnedMemory(void* pinned_memory_buffer, uint64_t size);
    ~PinnedMemory();
+    void* Allocate(uint64_t size);
+    void Deallocate(void* ptr);
+    uint64_t GetUsedPinnedMemorySizeInternal();
+
    void* pinned_memory_buffer_;
    std::mutex buffer_mtx_;
+    uint64_t used_pinned_memory_byte_size_;
+    std::map<void*, uint64_t> allocated_memory_info_;
    boost::interprocess::managed_external_buffer managed_pinned_memory_;
   };
 
@@ -99,6 +108,9 @@ class PinnedMemoryManager {
 
   static std::unique_ptr<PinnedMemoryManager> instance_;
   static uint64_t pinned_memory_byte_size_;
+  static std::mutex allocated_buffer_mtx_;
+  static std::vector<std::shared_ptr<PinnedMemory>>
+      allocated_pinned_memory_buffers_;
 
   std::mutex info_mtx_;
   std::map<void*, std::pair<bool, PinnedMemory*>> memory_info_;
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index a0a3177d2..048790dff 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -449,6 +449,8 @@ if(${TRITON_ENABLE_METRICS})
   add_executable(
     metrics_api_test
     metrics_api_test.cc
+    ${PINNED_MEMORY_MANAGER_SRCS}
+    ${PINNED_MEMORY_MANAGER_HDRS}
     ../metric_family.cc
     ../metric_family.h
     ../metrics.cc
@@ -495,6 +497,24 @@ if(${TRITON_ENABLE_METRICS})
       protobuf::libprotobuf
   )
 
+  if (TRITON_ENABLE_GPU)
+    target_link_libraries(
+      metrics_api_test
+      PRIVATE
+        ${CNMEM_LIBRARY}
+        CUDA::cudart
+    )
+  endif()
+
+  if (NOT WIN32)
+    target_link_libraries(
+      metrics_api_test
+      PRIVATE
+        dl
+        numa
+    )
+  endif()
+
   install(
     TARGETS metrics_api_test
     RUNTIME DESTINATION bin
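Since metrics_api_test now compiles and links the pinned memory manager sources, a natural follow-up is a check that the two new gauge names show up in the formatted metrics. The fragment below is only a sketch of that idea, not code taken from metrics_api_test.cc; it assumes an already-created TRITONSERVER_Server* (called server here) and the public triton/core/tritonserver.h header, and it elides error-object cleanup for brevity:

    #include <cstddef>
    #include <string>

    #include "triton/core/tritonserver.h"

    // Returns true if both pinned memory gauges appear in the Prometheus text
    // exposition produced by the server. Error handling is elided.
    bool
    HasPinnedMemoryGauges(TRITONSERVER_Server* server)
    {
      TRITONSERVER_Metrics* metrics = nullptr;
      TRITONSERVER_ServerMetrics(server, &metrics);

      const char* base = nullptr;
      size_t byte_size = 0;
      TRITONSERVER_MetricsFormatted(
          metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);

      const std::string text =
          (base != nullptr) ? std::string(base, byte_size) : std::string();
      const bool found =
          (text.find("nv_pinned_memory_pool_total_bytes") != std::string::npos) &&
          (text.find("nv_pinned_memory_pool_used_bytes") != std::string::npos);

      TRITONSERVER_MetricsDelete(metrics);
      return found;
    }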