Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry pick - Add Pinned Memory metrics (#306) #315

Merged
merged 1 commit into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 63 additions & 3 deletions src/metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <thread>

#include "constants.h"
#include "pinned_memory_manager.h"
#include "prometheus/detail/utils.h"
#include "triton/common/logging.h"

Expand Down Expand Up @@ -108,6 +109,18 @@ Metrics::Metrics()
"execution per-model.")
.Register(*registry_)),

pinned_memory_pool_total_family_(
prometheus::BuildGauge()
.Name("nv_pinned_memory_pool_total_bytes")
.Help("Pinned memory pool total memory size, in bytes")
.Register(*registry_)),

pinned_memory_pool_used_family_(
prometheus::BuildGauge()
.Name("nv_pinned_memory_pool_used_bytes")
.Help("Pinned memory pool used memory size, in bytes")
.Register(*registry_)),

// Per-model cache metric families
// NOTE: These are used in infer_stats for perf_analyzer
cache_num_hits_model_family_(prometheus::BuildCounter()
Expand Down Expand Up @@ -222,7 +235,8 @@ Metrics::Metrics()
#endif // TRITON_ENABLE_METRICS_CPU

metrics_enabled_(false), gpu_metrics_enabled_(false),
cpu_metrics_enabled_(false), metrics_interval_ms_(2000)
cpu_metrics_enabled_(false), pinned_memory_metrics_enabled_(false),
metrics_interval_ms_(2000)
{
}

Expand Down Expand Up @@ -295,8 +309,23 @@ Metrics::EnableMetrics()
{
auto singleton = GetSingleton();
singleton->metrics_enabled_ = true;

EnablePinnedMemoryMetrics();
}

void
Metrics::EnablePinnedMemoryMetrics()
{
  auto singleton = GetSingleton();
  // Idempotent: create the pinned memory gauges only on the first call;
  // subsequent calls are no-ops.
  if (!singleton->pinned_memory_metrics_enabled_) {
    singleton->InitializePinnedMemoryMetrics();
    singleton->pinned_memory_metrics_enabled_ = true;
  }
}


void
Metrics::EnableGPUMetrics()
{
Expand Down Expand Up @@ -357,8 +386,10 @@ bool
Metrics::StartPollingThread()
{
// Nothing to poll if no polling metrics enabled, don't spawn a thread
if (!gpu_metrics_enabled_ && !cpu_metrics_enabled_) {
LOG_WARNING << "No polling metrics (CPU, GPU) are enabled. Will not "
if (!gpu_metrics_enabled_ && !cpu_metrics_enabled_ &&
!pinned_memory_metrics_enabled_) {
LOG_WARNING << "No polling metrics (CPU, GPU, Pinned memory) are "
"enabled. Will not "
"poll for them.";
return false;
}
Expand All @@ -372,6 +403,10 @@ Metrics::StartPollingThread()
std::this_thread::sleep_for(
std::chrono::milliseconds(metrics_interval_ms_ / 2));

if (pinned_memory_metrics_enabled_) {
PollPinnedMemoryMetrics();
}

#ifdef TRITON_ENABLE_METRICS_GPU
// Poll DCGM GPU metrics
if (gpu_metrics_enabled_ &&
Expand All @@ -391,6 +426,20 @@ Metrics::StartPollingThread()
return true;
}

bool
Metrics::PollPinnedMemoryMetrics()
{
uint64_t pinned_memory_byte_size =
PinnedMemoryManager::GetTotalPinnedMemoryByteSize();
uint64_t used_pinned_memory_byte_size =
PinnedMemoryManager::GetUsedPinnedMemoryByteSize();

pinned_memory_pool_total_->Set(pinned_memory_byte_size);
pinned_memory_pool_used_->Set(used_pinned_memory_byte_size);

return true;
}

#ifdef TRITON_ENABLE_METRICS_CPU
Status
Metrics::ParseCpuInfo(CpuInfo& info)
Expand Down Expand Up @@ -676,6 +725,17 @@ Metrics::PollDcgmMetrics()
#endif // TRITON_ENABLE_METRICS_GPU
}

bool
Metrics::InitializePinnedMemoryMetrics()
{
const std::map<std::string, std::string> pinned_memory_labels;
pinned_memory_pool_total_ =
&pinned_memory_pool_total_family_.Add(pinned_memory_labels);
pinned_memory_pool_used_ =
&pinned_memory_pool_used_family_.Add(pinned_memory_labels);
return true;
}

bool
Metrics::InitializeCpuMetrics()
{
Expand Down
11 changes: 11 additions & 0 deletions src/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ class Metrics {
// Enable reporting of metrics
static void EnableMetrics();

// Enable reporting of Pinned memory metrics
static void EnablePinnedMemoryMetrics();

// Enable reporting of GPU metrics
static void EnableGPUMetrics();

Expand Down Expand Up @@ -270,7 +273,9 @@ class Metrics {
static Metrics* GetSingleton();
bool InitializeDcgmMetrics();
bool InitializeCpuMetrics();
bool InitializePinnedMemoryMetrics();
bool StartPollingThread();
bool PollPinnedMemoryMetrics();
bool PollDcgmMetrics();
bool PollCpuMetrics();

Expand All @@ -295,6 +300,11 @@ class Metrics {
inf_compute_output_duration_us_family_;
prometheus::Family<prometheus::Gauge>& inf_pending_request_count_family_;

prometheus::Family<prometheus::Gauge>& pinned_memory_pool_total_family_;
prometheus::Family<prometheus::Gauge>& pinned_memory_pool_used_family_;
prometheus::Gauge* pinned_memory_pool_total_;
prometheus::Gauge* pinned_memory_pool_used_;

// Per-model Response Cache metrics
// NOTE: Per-model metrics are used in infer_stats for perf_analyzer. Global
// cache metrics will be implemented by cache and published through
Expand Down Expand Up @@ -356,6 +366,7 @@ class Metrics {
bool metrics_enabled_;
bool gpu_metrics_enabled_;
bool cpu_metrics_enabled_;
bool pinned_memory_metrics_enabled_;
bool poll_thread_started_;
std::mutex metrics_enabling_;
std::mutex poll_thread_starting_;
Expand Down
70 changes: 62 additions & 8 deletions src/pinned_memory_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,16 @@ ParseIntOption(const std::string& msg, const std::string& arg, int* value)
} // namespace

std::unique_ptr<PinnedMemoryManager> PinnedMemoryManager::instance_;
uint64_t PinnedMemoryManager::pinned_memory_byte_size_;
uint64_t PinnedMemoryManager::pinned_memory_byte_size_ = 0;
std::mutex PinnedMemoryManager::allocated_buffer_mtx_;
std::vector<std::shared_ptr<PinnedMemoryManager::PinnedMemory>>
PinnedMemoryManager::allocated_pinned_memory_buffers_;

PinnedMemoryManager::PinnedMemory::PinnedMemory(
void* pinned_memory_buffer, uint64_t size)
: pinned_memory_buffer_(pinned_memory_buffer)
{
used_pinned_memory_byte_size_ = 0;
if (pinned_memory_buffer_ != nullptr) {
managed_pinned_memory_ = boost::interprocess::managed_external_buffer(
boost::interprocess::create_only_t{}, pinned_memory_buffer_, size);
Expand All @@ -87,9 +91,39 @@ PinnedMemoryManager::PinnedMemory::~PinnedMemory()
#endif // TRITON_ENABLE_GPU
}

void*
PinnedMemoryManager::PinnedMemory::Allocate(uint64_t size)
{
  // Allocate 'size' bytes from the managed pinned buffer and record the
  // allocation so used-byte accounting stays accurate. Returns nullptr on
  // failure (nothrow allocation).
  std::lock_guard<std::mutex> lk(buffer_mtx_);
  void* ptr = managed_pinned_memory_.allocate(size, std::nothrow_t{});
  // Only track successful allocations: the original code updated the
  // counter and map unconditionally, so a failed allocation would inflate
  // used_pinned_memory_byte_size_ and insert a bogus nullptr entry.
  if (ptr != nullptr) {
    used_pinned_memory_byte_size_ += size;
    allocated_memory_info_.emplace(ptr, size);
  }
  return ptr;
}

void
PinnedMemoryManager::PinnedMemory::Deallocate(void* ptr)
{
  // Return 'ptr' to the managed pinned buffer and drop its bookkeeping
  // entry, decrementing the used-byte counter by the recorded size.
  std::lock_guard<std::mutex> lk(buffer_mtx_);
  const auto entry = allocated_memory_info_.find(ptr);
  if (entry != allocated_memory_info_.end()) {
    used_pinned_memory_byte_size_ -= entry->second;
    allocated_memory_info_.erase(entry);
  }
  managed_pinned_memory_.deallocate(ptr);
}

// Return the number of bytes currently allocated from this pinned buffer.
// Takes buffer_mtx_ so the read is consistent with concurrent
// Allocate()/Deallocate() updates.
uint64_t
PinnedMemoryManager::PinnedMemory::GetUsedPinnedMemorySizeInternal()
{
  std::lock_guard<std::mutex> lk(buffer_mtx_);
  return used_pinned_memory_byte_size_;
}

PinnedMemoryManager::~PinnedMemoryManager()
{
// Clean up
allocated_pinned_memory_buffers_.clear();
for (const auto& memory_info : memory_info_) {
const auto& is_pinned = memory_info.second.first;
if (!is_pinned) {
Expand All @@ -104,6 +138,10 @@ PinnedMemoryManager::AddPinnedMemoryBuffer(
unsigned long node_mask)
{
pinned_memory_buffers_[node_mask] = pinned_memory_buffer;
{
std::lock_guard<std::mutex> lk(allocated_buffer_mtx_);
allocated_pinned_memory_buffers_.push_back(pinned_memory_buffer);
}
}

Status
Expand All @@ -113,9 +151,7 @@ PinnedMemoryManager::AllocInternal(
{
auto status = Status::Success;
if (pinned_memory_buffer->pinned_memory_buffer_ != nullptr) {
std::lock_guard<std::mutex> lk(pinned_memory_buffer->buffer_mtx_);
*ptr = pinned_memory_buffer->managed_pinned_memory_.allocate(
size, std::nothrow_t{});
*ptr = pinned_memory_buffer->Allocate(size);
*allocated_type = TRITONSERVER_MEMORY_CPU_PINNED;
if (*ptr == nullptr) {
status = Status(
Expand Down Expand Up @@ -167,8 +203,7 @@ PinnedMemoryManager::AllocInternal(

if ((!status.IsOk()) && (*ptr != nullptr)) {
if (is_pinned) {
std::lock_guard<std::mutex> lk(pinned_memory_buffer->buffer_mtx_);
pinned_memory_buffer->managed_pinned_memory_.deallocate(*ptr);
pinned_memory_buffer->Deallocate(*ptr);
} else {
free(*ptr);
}
Expand Down Expand Up @@ -201,8 +236,7 @@ PinnedMemoryManager::FreeInternal(void* ptr)
}

if (is_pinned) {
std::lock_guard<std::mutex> lk(pinned_memory_buffer->buffer_mtx_);
pinned_memory_buffer->managed_pinned_memory_.deallocate(ptr);
pinned_memory_buffer->Deallocate(ptr);
} else {
free(ptr);
}
Expand Down Expand Up @@ -376,4 +410,24 @@ PinnedMemoryManager::Free(void* ptr)
return instance_->FreeInternal(ptr);
}

// Return the configured total size of the pinned memory pool, in bytes.
// Read without a lock: presumably pinned_memory_byte_size_ is written once
// during initialization and constant afterwards — TODO confirm against Create().
uint64_t
PinnedMemoryManager::GetTotalPinnedMemoryByteSize()
{
  return pinned_memory_byte_size_;
}

// Return the total number of bytes currently allocated across all registered
// pinned memory buffers. allocated_buffer_mtx_ guards the buffer list against
// concurrent AddPinnedMemoryBuffer() registration.
uint64_t
PinnedMemoryManager::GetUsedPinnedMemoryByteSize()
{
  std::lock_guard<std::mutex> lk(allocated_buffer_mtx_);
  uint64_t used_pinned_memory_size = 0;
  // No need for a separate empty() guard — the loop body simply doesn't run
  // for an empty vector.
  for (const auto& buffer : allocated_pinned_memory_buffers_) {
    used_pinned_memory_size += buffer->GetUsedPinnedMemorySizeInternal();
  }
  return used_pinned_memory_size;
}

}} // namespace triton::core
12 changes: 12 additions & 0 deletions src/pinned_memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,22 @@ class PinnedMemoryManager {
// Return Status object indicating success or failure.
static Status Free(void* ptr);

static uint64_t GetTotalPinnedMemoryByteSize();
static uint64_t GetUsedPinnedMemoryByteSize();

private:
class PinnedMemory {
public:
PinnedMemory(void* pinned_memory_buffer, uint64_t size);
~PinnedMemory();
void* Allocate(uint64_t size);
void Deallocate(void* ptr);
uint64_t GetUsedPinnedMemorySizeInternal();

void* pinned_memory_buffer_;
std::mutex buffer_mtx_;
uint64_t used_pinned_memory_byte_size_;
std::map<void*, uint64_t> allocated_memory_info_;
boost::interprocess::managed_external_buffer managed_pinned_memory_;
};

Expand All @@ -99,6 +108,9 @@ class PinnedMemoryManager {

static std::unique_ptr<PinnedMemoryManager> instance_;
static uint64_t pinned_memory_byte_size_;
static std::mutex allocated_buffer_mtx_;
static std::vector<std::shared_ptr<PinnedMemory>>
allocated_pinned_memory_buffers_;

std::mutex info_mtx_;
std::map<void*, std::pair<bool, PinnedMemory*>> memory_info_;
Expand Down
20 changes: 20 additions & 0 deletions src/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,8 @@ if(${TRITON_ENABLE_METRICS})
add_executable(
metrics_api_test
metrics_api_test.cc
${PINNED_MEMORY_MANAGER_SRCS}
${PINNED_MEMORY_MANAGER_HDRS}
../metric_family.cc
../metric_family.h
../metrics.cc
Expand Down Expand Up @@ -495,6 +497,24 @@ if(${TRITON_ENABLE_METRICS})
protobuf::libprotobuf
)

if (TRITON_ENABLE_GPU)
target_link_libraries(
metrics_api_test
PRIVATE
${CNMEM_LIBRARY}
CUDA::cudart
)
endif()

if (NOT WIN32)
target_link_libraries(
metrics_api_test
PRIVATE
dl
numa
)
endif()

install(
TARGETS metrics_api_test
RUNTIME DESTINATION bin
Expand Down