From 6d6978297a9bf6cdf68e625f7583d415b12abb0b Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Wed, 29 May 2024 11:21:55 -0400 Subject: [PATCH] nvidia gpu count metrics and bugfix (#1183) --- plugins/processors/gpuattributes/processor.go | 13 +- .../gpuattributes/processor_test.go | 236 +- .../emf_and_kubernetes_with_gpu_config.yaml | 2463 +++++++++-------- .../otel/exporter/awsemf/kubernetes.go | 12 +- .../otel/exporter/awsemf/translator_test.go | 10 +- .../metricstransformprocessor/translator.go | 32 +- 6 files changed, 1497 insertions(+), 1269 deletions(-) diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index c62dc7e6b0..94fb411a53 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -134,7 +134,7 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me ils := ilms.At(j) metrics := ils.Metrics() - d.filterGpuMetricsWithoutPodName(metrics) + d.filterGpuMetricsWithoutPodName(metrics, rs.Resource().Attributes()) metricsLength := metrics.Len() for k := 0; k < metricsLength; k++ { @@ -227,15 +227,15 @@ func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels } // remove dcgm metrics that do not contain PodName attribute which means there is no workload associated to container/pod -func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice) { +func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice, resourceAttributes pcommon.Map) { metrics.RemoveIf(func(m pmetric.Metric) bool { isGpu := strings.Contains(m.Name(), gpuMetricIdentifier) isContainerOrPod := strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) || strings.HasPrefix(m.Name(), gpuPodMetricPrefix) - if !isGpu || !isContainerOrPod { return false } + _, hasPodAtResource := resourceAttributes.Get(internal.PodName) var dps pmetric.NumberDataPointSlice switch m.Type() { case pmetric.MetricTypeGauge: @@ -246,7 +246,10 @@ func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric. d.logger.Debug("Ignore unknown metric type", zap.String(containerinsightscommon.MetricType, m.Type().String())) } - _, hasPodInfo := dps.At(0).Attributes().Get(internal.PodName) - return !hasPodInfo + dps.RemoveIf(func(dp pmetric.NumberDataPoint) bool { + _, hasPodInfo := dp.Attributes().Get(internal.PodName) + return !hasPodInfo && !hasPodAtResource + }) + return dps.Len() == 0 }) } diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index 02bbd02ad7..a625945eda 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -19,93 +19,169 @@ func TestProcessMetrics(t *testing.T) { ctx := context.Background() testcases := map[string]struct { - resource string - metrics pmetric.Metrics - wantCnt int - want map[string]string + resource string + metrics pmetric.Metrics + wantMetricCnt int + want []map[string]string }{ "nonNode": { - metrics: generateMetrics("prefix", map[string]string{ - "ClusterName": "cluster", + metrics: generateMetrics("prefix", []map[string]string{ + { + "ClusterName": "cluster", + }, }), - wantCnt: 1, - want: map[string]string{ - "ClusterName": "cluster", + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + }, }, }, "nodeDropSimple": { - metrics: generateMetrics("node", map[string]string{ - "ClusterName": "cluster", - "Drop": "val", + metrics: generateMetrics("node", []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + }, }), - wantCnt: 1, - want: map[string]string{ - "ClusterName": "cluster", + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + }, }, }, "nodeDropJson": { - metrics: generateMetrics("node", map[string]string{ - "ClusterName": "cluster", - "kubernetes": "{\"host\":\"test\"}", + metrics: generateMetrics("node", []map[string]string{ + { + "ClusterName": "cluster", + "kubernetes": "{\"host\":\"test\"}", + }, }), - wantCnt: 1, - want: map[string]string{ - "ClusterName": "cluster", - "kubernetes": "{\"host\":\"test\"}", + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + "kubernetes": "{\"host\":\"test\"}", + }, }, }, "nodeDropMixed": { - metrics: generateMetrics("node", map[string]string{ - "ClusterName": "cluster", - "Drop": "val", - "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + metrics: generateMetrics("node", []map[string]string{ + { + "ClusterName": "cluster", + "Drop": "val", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, }), - wantCnt: 1, - want: map[string]string{ - "ClusterName": "cluster", - "kubernetes": "{\"host\":\"test\"}", + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + "kubernetes": "{\"host\":\"test\"}", + }, }, }, "dropPodWithoutPodName": { - metrics: generateMetrics("pod", map[string]string{ - "ClusterName": "cluster", - "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + metrics: generateMetrics("pod", []map[string]string{ + { + "ClusterName": "cluster", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, }), - wantCnt: 0, - want: map[string]string{}, + wantMetricCnt: 0, + want: []map[string]string{}, }, - "keepPodWithoutPodName": { - metrics: generateMetrics("pod", map[string]string{ - "ClusterName": "cluster", - "PodName": "pod", - "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + "keepPodWithPodName": { + metrics: generateMetrics("pod", []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, }), - wantCnt: 1, - want: map[string]string{ - "ClusterName": "cluster", - "PodName": "pod", - "kubernetes": "{\"host\":\"test\"}", + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod", + "kubernetes": "{\"host\":\"test\"}", + }, }, }, "dropContainerWithoutPodName": { - metrics: generateMetrics("container", map[string]string{ - "ClusterName": "cluster", - "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + metrics: generateMetrics("container", []map[string]string{ + { + "ClusterName": "cluster", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, }), - wantCnt: 0, - want: map[string]string{}, + wantMetricCnt: 0, + want: []map[string]string{}, }, - "keepContainerWithoutPodName": { - metrics: generateMetrics("container", map[string]string{ - "ClusterName": "cluster", - "PodName": "pod", - "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + "keepContainerWithPodName": { + metrics: generateMetrics("container", []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, }), - wantCnt: 1, - want: map[string]string{ - "ClusterName": "cluster", - "PodName": "pod", - "kubernetes": "{\"host\":\"test\"}", + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod", + "kubernetes": "{\"host\":\"test\"}", + }, + }, + }, + "dropSingleDatapointWithoutPodName": { + metrics: generateMetrics("container", []map[string]string{ + { + "ClusterName": "cluster", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, + { + "ClusterName": "cluster", + "PodName": "pod", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, + }), + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod", + "kubernetes": "{\"host\":\"test\"}", + }, + }, + }, + "keepAllDatapoints": { + metrics: generateMetrics("container", []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod1", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, + { + "ClusterName": "cluster", + "PodName": "pod2", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", + }, + }), + wantMetricCnt: 1, + want: []map[string]string{ + { + "ClusterName": "cluster", + "PodName": "pod1", + "kubernetes": "{\"host\":\"test\"}", + }, + { + "ClusterName": "cluster", + "PodName": "pod2", + "kubernetes": "{\"host\":\"test\"}", + }, }, }, } @@ -113,30 +189,34 @@ func TestProcessMetrics(t *testing.T) { for tname, tc := range testcases { fmt.Printf("running %s\n", tname) ms, _ := gp.processMetrics(ctx, tc.metrics) - assert.Equal(t, tc.wantCnt, ms.MetricCount()) - if tc.wantCnt > 0 { - attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() - assert.Equal(t, len(tc.want), attrs.Len()) - for k, v := range tc.want { - got, ok := attrs.Get(k) - assert.True(t, ok) - assert.Equal(t, v, got.Str()) + assert.Equal(t, tc.wantMetricCnt, ms.MetricCount()) + if tc.wantMetricCnt > 0 { + dps := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints() + assert.Equal(t, len(tc.want), dps.Len()) + for i, dim := range tc.want { + attrs := dps.At(i).Attributes() + assert.Equal(t, len(dim), attrs.Len()) + for k, v := range dim { + got, ok := attrs.Get(k) + assert.True(t, ok) + assert.Equal(t, v, got.Str()) + } } } } } -func generateMetrics(prefix string, dimensions map[string]string) pmetric.Metrics { +func generateMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { md := pmetric.NewMetrics() - - m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() - m.SetName(prefix + gpuMetricIdentifier) - gauge := m.SetEmptyGauge().DataPoints().AppendEmpty() - gauge.SetIntValue(10) - - for k, v := range dimensions { - gauge.Attributes().PutStr(k, v) + ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() + ms.SetName(prefix + gpuMetricIdentifier) + dps := ms.SetEmptyGauge().DataPoints() + for _, dim := range dimensions { + dp := dps.AppendEmpty() + dp.SetIntValue(10) + for k, v := range dim { + dp.Attributes().PutStr(k, v) + } } - return md } diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index f8137e88c9..1be722d577 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -1,1185 +1,1288 @@ exporters: - awscloudwatchlogs/emf_logs: - certificate_file_path: "" - emf_only: true - endpoint: https://fake_endpoint - imds_retries: 2 - local_mode: true - log_group_name: emf/logs/default - log_retention: 0 - log_stream_name: host_name_from_env - max_retries: 2 - middleware: agenthealth/logs - no_verify_ssl: false - num_workers: 8 - profile: default - proxy_address: "" - raw_log: true - region: us-east-1 - request_timeout_seconds: 30 - resource_arn: "" - retry_on_failure: - enabled: true - initial_interval: 5s - max_elapsed_time: 5m0s - max_interval: 30s - multiplier: 1.5 - randomization_factor: 0.5 - role_arn: "" - sending_queue: - enabled: true - num_consumers: 1 - queue_size: 1000 - shared_credentials_file: - - /root/.aws/credentials - awsemf/containerinsights: - certificate_file_path: "" - detailed_metrics: false - dimension_rollup_option: NoDimensionRollup - disable_metric_extraction: true - eks_fargate_container_insights_enabled: false - endpoint: https://fake_endpoint - enhanced_container_insights: true - imds_retries: 2 - local_mode: true - log_group_name: /aws/containerinsights/{ClusterName}/performance - log_retention: 0 - log_stream_name: '{NodeName}' - max_retries: 2 - metric_declarations: - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - - - ClusterName - - ContainerName - - Namespace - - PodName - metric_name_selectors: - - container_cpu_utilization - - container_cpu_utilization_over_container_limit - - container_cpu_limit - - container_cpu_request - - container_memory_utilization - - container_memory_utilization_over_container_limit - - container_memory_failures_total - - container_memory_limit - - container_memory_request - - container_filesystem_usage - - container_filesystem_available - - container_filesystem_utilization - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - - ClusterName - - FullPodName - - Namespace - - PodName - metric_name_selectors: - - pod_cpu_utilization - - pod_memory_utilization - - pod_network_rx_bytes - - pod_network_tx_bytes - - pod_cpu_utilization_over_pod_limit - - pod_memory_utilization_over_pod_limit - - dimensions: - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - Namespace - - PodName - - - ClusterName - - Namespace - - - ClusterName - metric_name_selectors: - - pod_interface_network_rx_dropped - - pod_interface_network_tx_dropped - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - Namespace - - Service - metric_name_selectors: - - pod_cpu_reserved_capacity - - pod_memory_reserved_capacity - - pod_number_of_container_restarts - - pod_number_of_containers - - pod_number_of_running_containers - - pod_status_ready - - pod_status_scheduled - - pod_status_running - - pod_status_pending - - pod_status_failed - - pod_status_unknown - - pod_status_succeeded - - pod_memory_request - - pod_memory_limit - - pod_cpu_limit - - pod_cpu_request - - pod_container_status_running - - pod_container_status_terminated - - pod_container_status_waiting - - pod_container_status_waiting_reason_crash_loop_back_off - - pod_container_status_waiting_reason_image_pull_error - - pod_container_status_waiting_reason_start_error - - pod_container_status_waiting_reason_create_container_error - - pod_container_status_waiting_reason_create_container_config_error - - pod_container_status_terminated_reason_oom_killed - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - metric_name_selectors: - - node_cpu_utilization - - node_memory_utilization - - node_network_total_bytes - - node_cpu_reserved_capacity - - node_memory_reserved_capacity - - node_number_of_running_pods - - node_number_of_running_containers - - node_cpu_usage_total - - node_cpu_limit - - node_memory_working_set - - node_memory_limit - - node_status_condition_ready - - node_status_condition_disk_pressure - - node_status_condition_memory_pressure - - node_status_condition_pid_pressure - - node_status_condition_network_unavailable - - node_status_condition_unknown - - node_status_capacity_pods - - node_status_allocatable_pods - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - metric_name_selectors: - - node_interface_network_rx_dropped - - node_interface_network_tx_dropped - - node_diskio_io_service_bytes_total - - node_diskio_io_serviced_total - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - metric_name_selectors: - - node_filesystem_utilization - - node_filesystem_inodes - - node_filesystem_inodes_free - - dimensions: - - - ClusterName - - Namespace - - Service - - - ClusterName - metric_name_selectors: - - service_number_of_running_pods - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - metric_name_selectors: - - replicas_desired - - replicas_ready - - status_replicas_available - - status_replicas_unavailable - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - metric_name_selectors: - - daemonset_status_number_available - - daemonset_status_number_unavailable - - dimensions: - - - ClusterName - - Namespace - - - ClusterName - metric_name_selectors: - - namespace_number_of_running_pods - - dimensions: - - - ClusterName - metric_name_selectors: - - cluster_node_count - - cluster_failed_node_count - - cluster_number_of_running_pods - - dimensions: - - - ClusterName - - endpoint - - - ClusterName - metric_name_selectors: - - apiserver_storage_size_bytes - - apiserver_storage_db_total_size_in_bytes - - etcd_db_total_size_in_bytes - - dimensions: - - - ClusterName - - resource - - - ClusterName - metric_name_selectors: - - apiserver_storage_list_duration_seconds - - apiserver_longrunning_requests - - apiserver_storage_objects - - dimensions: - - - ClusterName - - verb - - - ClusterName - metric_name_selectors: - - apiserver_request_duration_seconds - - rest_client_request_duration_seconds - - dimensions: - - - ClusterName - - code - - verb - - - ClusterName - metric_name_selectors: - - apiserver_request_total - - apiserver_request_total_5xx - - dimensions: - - - ClusterName - - operation - - - ClusterName - metric_name_selectors: - - apiserver_admission_controller_admission_duration_seconds - - apiserver_admission_step_admission_duration_seconds - - etcd_request_duration_seconds - - dimensions: - - - ClusterName - - code - - method - - - ClusterName - metric_name_selectors: - - rest_client_requests_total - - dimensions: - - - ClusterName - - request_kind - - - ClusterName - metric_name_selectors: - - apiserver_current_inflight_requests - - apiserver_current_inqueue_requests - - dimensions: - - - ClusterName - - name - - - ClusterName - metric_name_selectors: - - apiserver_admission_webhook_admission_duration_seconds - - dimensions: - - - ClusterName - - group - - - ClusterName - metric_name_selectors: - - apiserver_requested_deprecated_apis - - dimensions: - - - ClusterName - - reason - - - ClusterName - metric_name_selectors: - - apiserver_flowcontrol_rejected_requests_total - - dimensions: - - - ClusterName - - priority_level - - - ClusterName - metric_name_selectors: - - apiserver_flowcontrol_request_concurrency_limit - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - GpuDevice - - Namespace - - PodName - metric_name_selectors: - - container_gpu_utilization - - container_gpu_memory_utilization - - container_gpu_memory_total - - container_gpu_memory_used - - container_gpu_power_draw - - container_gpu_temperature - - dimensions: - - - ClusterName - - - ClusterName - - Namespace - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - FullPodName - - GpuDevice - - Namespace - - PodName - metric_name_selectors: - - pod_gpu_utilization - - pod_gpu_memory_utilization - - pod_gpu_memory_total - - pod_gpu_memory_used - - pod_gpu_power_draw - - pod_gpu_temperature - - dimensions: - - - ClusterName - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - - GpuDevice - - InstanceId - - InstanceType - - NodeName - metric_name_selectors: - - node_gpu_utilization - - node_gpu_memory_utilization - - node_gpu_memory_total - - node_gpu_memory_used - - node_gpu_power_draw - - node_gpu_temperature - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - metric_name_selectors: - - node_gpu_total - - node_gpu_request - - node_gpu_limit - - dimensions: - - - ClusterName - metric_name_selectors: - - cluster_gpu_request - - cluster_gpu_total - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - NeuronCore - - NeuronDevice - - PodName - metric_name_selectors: - - container_neuroncore_utilization - - container_neuroncore_memory_usage_total - - container_neuroncore_memory_usage_constants - - container_neuroncore_memory_usage_model_code - - container_neuroncore_memory_usage_model_shared_scratchpad - - container_neuroncore_memory_usage_runtime_memory - - container_neuroncore_memory_usage_tensors - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - NeuronDevice - - PodName - metric_name_selectors: - - container_neurondevice_hw_ecc_events_total - - dimensions: - - - ClusterName - - - ClusterName - - Namespace - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - NeuronCore - - NeuronDevice - - PodName - metric_name_selectors: - - pod_neuroncore_utilization - - pod_neuroncore_memory_usage_total - - pod_neuroncore_memory_usage_constants - - pod_neuroncore_memory_usage_model_code - - pod_neuroncore_memory_usage_model_shared_scratchpad - - pod_neuroncore_memory_usage_runtime_memory - - pod_neuroncore_memory_usage_tensors - - dimensions: - - - ClusterName - - - ClusterName - - Namespace - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - NeuronDevice - - PodName - metric_name_selectors: - - pod_neurondevice_hw_ecc_events_total - - dimensions: - - - ClusterName - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - - InstanceId - - InstanceType - - NeuronCore - - NeuronDevice - - NodeName - metric_name_selectors: - - node_neuroncore_utilization - - node_neuroncore_memory_usage_total - - node_neuroncore_memory_usage_constants - - node_neuroncore_memory_usage_model_code - - node_neuroncore_memory_usage_model_shared_scratchpad - - node_neuroncore_memory_usage_runtime_memory - - node_neuroncore_memory_usage_tensors - - dimensions: - - - ClusterName - - - ClusterName - - InstanceId - - NodeName - metric_name_selectors: - - node_neuron_execution_errors_total - - node_neurondevice_runtime_memory_used_bytes - - node_neuron_execution_latency - - dimensions: - - - ClusterName - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - - InstanceId - - NeuronDevice - - NodeName - metric_name_selectors: - - node_neurondevice_hw_ecc_events_total - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - metric_name_selectors: - - container_efa_rx_bytes - - container_efa_tx_bytes - - container_efa_rx_dropped - - container_efa_rdma_read_bytes - - container_efa_rdma_write_bytes - - container_efa_rdma_write_recv_bytes - - dimensions: - - - ClusterName - - - ClusterName - - Namespace - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - PodName - metric_name_selectors: - - pod_efa_rx_bytes - - pod_efa_tx_bytes - - pod_efa_rx_dropped - - pod_efa_rdma_read_bytes - - pod_efa_rdma_write_bytes - - pod_efa_rdma_write_recv_bytes - - dimensions: - - - ClusterName - - - ClusterName - - InstanceId - - NodeName - metric_name_selectors: - - node_efa_rx_bytes - - node_efa_tx_bytes - - node_efa_rx_dropped - - node_efa_rdma_read_bytes - - node_efa_rdma_write_bytes - - node_efa_rdma_write_recv_bytes - metric_descriptors: - - metric_name: apiserver_admission_controller_admission_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_admission_step_admission_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_admission_webhook_admission_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_current_inflight_requests - overwrite: true - unit: Count - - metric_name: apiserver_current_inqueue_requests - overwrite: true - unit: Count - - metric_name: apiserver_flowcontrol_rejected_requests_total - overwrite: true - unit: Count - - metric_name: apiserver_flowcontrol_request_concurrency_limit - overwrite: true - unit: Count - - metric_name: apiserver_longrunning_requests - overwrite: true - unit: Count - - metric_name: apiserver_request_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_request_total - overwrite: true - unit: Count - - metric_name: apiserver_request_total_5xx - overwrite: true - unit: Count - - metric_name: apiserver_requested_deprecated_apis - overwrite: true - unit: Count - - metric_name: apiserver_storage_objects - overwrite: true - unit: Count - - metric_name: etcd_request_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_storage_list_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_storage_db_total_size_in_bytes - overwrite: true - unit: Bytes - - metric_name: apiserver_storage_size_bytes - overwrite: true - unit: Bytes - - metric_name: etcd_db_total_size_in_bytes - overwrite: true - unit: Bytes - - metric_name: rest_client_request_duration_seconds - overwrite: true - unit: Seconds - - metric_name: rest_client_requests_total - overwrite: true - unit: Count - middleware: agenthealth/logs - namespace: ContainerInsights - no_verify_ssl: false - num_workers: 8 - output_destination: cloudwatch - parse_json_encoded_attr_values: - - Sources - - kubernetes - profile: default - proxy_address: "" - region: us-east-1 - request_timeout_seconds: 30 - resource_arn: "" - resource_to_telemetry_conversion: - enabled: true - retain_initial_value_of_delta_metric: false - role_arn: "" - shared_credentials_file: - - /root/.aws/credentials - version: "0" + awscloudwatchlogs/emf_logs: + certificate_file_path: "" + emf_only: true + endpoint: https://fake_endpoint + imds_retries: 2 + local_mode: true + log_group_name: emf/logs/default + log_retention: 0 + log_stream_name: host_name_from_env + max_retries: 2 + middleware: agenthealth/logs + no_verify_ssl: false + num_workers: 8 + profile: default + proxy_address: "" + raw_log: true + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + retry_on_failure: + enabled: true + initial_interval: 5s + max_elapsed_time: 5m0s + max_interval: 30s + multiplier: 1.5 + randomization_factor: 0.5 + role_arn: "" + sending_queue: + enabled: true + num_consumers: 1 + queue_size: 1000 + shared_credentials_file: + - /root/.aws/credentials + awsemf/containerinsights: + certificate_file_path: "" + detailed_metrics: false + dimension_rollup_option: NoDimensionRollup + disable_metric_extraction: true + eks_fargate_container_insights_enabled: false + endpoint: https://fake_endpoint + enhanced_container_insights: true + imds_retries: 2 + local_mode: true + log_group_name: /aws/containerinsights/{ClusterName}/performance + log_retention: 0 + log_stream_name: '{NodeName}' + max_retries: 2 + metric_declarations: + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - Namespace + - PodName + metric_name_selectors: + - container_cpu_utilization + - container_cpu_utilization_over_container_limit + - container_cpu_limit + - container_cpu_request + - container_memory_utilization + - container_memory_utilization_over_container_limit + - container_memory_failures_total + - container_memory_limit + - container_memory_request + - container_filesystem_usage + - container_filesystem_available + - container_filesystem_utilization + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - - ClusterName + - FullPodName + - Namespace + - PodName + metric_name_selectors: + - pod_cpu_utilization + - pod_memory_utilization + - pod_network_rx_bytes + - pod_network_tx_bytes + - pod_cpu_utilization_over_pod_limit + - pod_memory_utilization_over_pod_limit + - dimensions: + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - PodName + - - ClusterName + - Namespace + - - ClusterName + metric_name_selectors: + - pod_interface_network_rx_dropped + - pod_interface_network_tx_dropped + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - Service + metric_name_selectors: + - pod_cpu_reserved_capacity + - pod_memory_reserved_capacity + - pod_number_of_container_restarts + - pod_number_of_containers + - pod_number_of_running_containers + - pod_status_ready + - pod_status_scheduled + - pod_status_running + - pod_status_pending + - pod_status_failed + - pod_status_unknown + - pod_status_succeeded + - pod_memory_request + - pod_memory_limit + - pod_cpu_limit + - pod_cpu_request + - pod_container_status_running + - pod_container_status_terminated + - pod_container_status_waiting + - pod_container_status_waiting_reason_crash_loop_back_off + - pod_container_status_waiting_reason_image_pull_error + - pod_container_status_waiting_reason_start_error + - pod_container_status_waiting_reason_create_container_error + - pod_container_status_waiting_reason_create_container_config_error + - pod_container_status_terminated_reason_oom_killed + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + metric_name_selectors: + - node_cpu_utilization + - node_memory_utilization + - node_network_total_bytes + - node_cpu_reserved_capacity + - node_memory_reserved_capacity + - node_number_of_running_pods + - node_number_of_running_containers + - node_cpu_usage_total + - node_cpu_limit + - node_memory_working_set + - node_memory_limit + - node_status_condition_ready + - node_status_condition_disk_pressure + - node_status_condition_memory_pressure + - node_status_condition_pid_pressure + - node_status_condition_network_unavailable + - node_status_condition_unknown + - node_status_capacity_pods + - node_status_allocatable_pods + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + metric_name_selectors: + - node_interface_network_rx_dropped + - node_interface_network_tx_dropped + - node_diskio_io_service_bytes_total + - node_diskio_io_serviced_total + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + metric_name_selectors: + - node_filesystem_utilization + - node_filesystem_inodes + - node_filesystem_inodes_free + - dimensions: + - - ClusterName + - Namespace + - Service + - - ClusterName + metric_name_selectors: + - service_number_of_running_pods + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + metric_name_selectors: + - replicas_desired + - replicas_ready + - status_replicas_available + - status_replicas_unavailable + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + metric_name_selectors: + - daemonset_status_number_available + - daemonset_status_number_unavailable + - dimensions: + - - ClusterName + - Namespace + - - ClusterName + metric_name_selectors: + - namespace_number_of_running_pods + - dimensions: + - - ClusterName + metric_name_selectors: + - cluster_node_count + - cluster_failed_node_count + - cluster_number_of_running_pods + - dimensions: + - - ClusterName + - endpoint + - - ClusterName + metric_name_selectors: + - apiserver_storage_size_bytes + - apiserver_storage_db_total_size_in_bytes + - etcd_db_total_size_in_bytes + - dimensions: + - - ClusterName + - resource + - - ClusterName + metric_name_selectors: + - apiserver_storage_list_duration_seconds + - apiserver_longrunning_requests + - apiserver_storage_objects + - dimensions: + - - ClusterName + - verb + - - ClusterName + metric_name_selectors: + - apiserver_request_duration_seconds + - rest_client_request_duration_seconds + - dimensions: + - - ClusterName + - code + - verb + - - ClusterName + metric_name_selectors: + - apiserver_request_total + - apiserver_request_total_5xx + - dimensions: + - - ClusterName + - operation + - - ClusterName + metric_name_selectors: + - apiserver_admission_controller_admission_duration_seconds + - apiserver_admission_step_admission_duration_seconds + - etcd_request_duration_seconds + - dimensions: + - - ClusterName + - code + - method + - - ClusterName + metric_name_selectors: + - rest_client_requests_total + - dimensions: + - - ClusterName + - request_kind + - - ClusterName + metric_name_selectors: + - apiserver_current_inflight_requests + - apiserver_current_inqueue_requests + - dimensions: + - - ClusterName + - name + - - ClusterName + metric_name_selectors: + - apiserver_admission_webhook_admission_duration_seconds + - dimensions: + - - ClusterName + - group + - - ClusterName + metric_name_selectors: + - apiserver_requested_deprecated_apis + - dimensions: + - - ClusterName + - reason + - - ClusterName + metric_name_selectors: + - apiserver_flowcontrol_rejected_requests_total + - dimensions: + - - ClusterName + - priority_level + - - ClusterName + metric_name_selectors: + - apiserver_flowcontrol_request_concurrency_limit + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - GpuDevice + - Namespace + - PodName + metric_name_selectors: + - container_gpu_utilization + - container_gpu_memory_utilization + - container_gpu_memory_total + - container_gpu_memory_used + - container_gpu_power_draw + - container_gpu_temperature + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - GpuDevice + - Namespace + - PodName + metric_name_selectors: + - pod_gpu_utilization + - pod_gpu_memory_utilization + - pod_gpu_memory_total + - pod_gpu_memory_used + - pod_gpu_power_draw + - pod_gpu_temperature + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - GpuDevice + - InstanceId + - InstanceType + - NodeName + metric_name_selectors: + - node_gpu_utilization + - node_gpu_memory_utilization + - node_gpu_memory_total + - node_gpu_memory_used + - node_gpu_power_draw + - node_gpu_temperature + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + metric_name_selectors: + - pod_gpu_total + - pod_gpu_request + - pod_gpu_limit + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - InstanceType + - NodeName + metric_name_selectors: + - node_gpu_total + - node_gpu_request + - node_gpu_limit + - dimensions: + - - ClusterName + metric_name_selectors: + - cluster_gpu_total + - cluster_gpu_request + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - NeuronCore + - NeuronDevice + - PodName + metric_name_selectors: + - container_neuroncore_utilization + - container_neuroncore_memory_usage_total + - container_neuroncore_memory_usage_constants + - container_neuroncore_memory_usage_model_code + - container_neuroncore_memory_usage_model_shared_scratchpad + - container_neuroncore_memory_usage_runtime_memory + - container_neuroncore_memory_usage_tensors + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - NeuronDevice + - PodName + metric_name_selectors: + - container_neurondevice_hw_ecc_events_total + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - NeuronCore + - NeuronDevice + - PodName + metric_name_selectors: + - pod_neuroncore_utilization + - pod_neuroncore_memory_usage_total + - pod_neuroncore_memory_usage_constants + - pod_neuroncore_memory_usage_model_code + - pod_neuroncore_memory_usage_model_shared_scratchpad + - pod_neuroncore_memory_usage_runtime_memory + - pod_neuroncore_memory_usage_tensors + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - NeuronDevice + - PodName + metric_name_selectors: + - pod_neurondevice_hw_ecc_events_total + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - InstanceId + - InstanceType + - NeuronCore + - NeuronDevice + - NodeName + metric_name_selectors: + - node_neuroncore_utilization + - node_neuroncore_memory_usage_total + - node_neuroncore_memory_usage_constants + - node_neuroncore_memory_usage_model_code + - node_neuroncore_memory_usage_model_shared_scratchpad + - node_neuroncore_memory_usage_runtime_memory + - node_neuroncore_memory_usage_tensors + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + metric_name_selectors: + - node_neuron_execution_errors_total + - node_neurondevice_runtime_memory_used_bytes + - node_neuron_execution_latency + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - InstanceId + - NeuronDevice + - NodeName + metric_name_selectors: + - node_neurondevice_hw_ecc_events_total + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + metric_name_selectors: + - container_efa_rx_bytes + - container_efa_tx_bytes + - container_efa_rx_dropped + - container_efa_rdma_read_bytes + - container_efa_rdma_write_bytes + - container_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + metric_name_selectors: + - pod_efa_rx_bytes + - pod_efa_tx_bytes + - pod_efa_rx_dropped + - pod_efa_rdma_read_bytes + - pod_efa_rdma_write_bytes + - pod_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + metric_name_selectors: + - node_efa_rx_bytes + - node_efa_tx_bytes + - node_efa_rx_dropped + - node_efa_rdma_read_bytes + - node_efa_rdma_write_bytes + - node_efa_rdma_write_recv_bytes + metric_descriptors: + - metric_name: apiserver_admission_controller_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_step_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_webhook_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_current_inflight_requests + overwrite: true + unit: Count + - metric_name: apiserver_current_inqueue_requests + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_rejected_requests_total + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_request_concurrency_limit + overwrite: true + unit: Count + - metric_name: apiserver_longrunning_requests + overwrite: true + unit: Count + - metric_name: apiserver_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_request_total + overwrite: true + unit: Count + - metric_name: apiserver_request_total_5xx + overwrite: true + unit: Count + - metric_name: apiserver_requested_deprecated_apis + overwrite: true + unit: Count + - metric_name: apiserver_storage_objects + overwrite: true + unit: Count + - metric_name: etcd_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_list_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: apiserver_storage_size_bytes + overwrite: true + unit: Bytes + - metric_name: etcd_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: rest_client_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: rest_client_requests_total + overwrite: true + unit: Count + middleware: agenthealth/logs + namespace: ContainerInsights + no_verify_ssl: false + num_workers: 8 + output_destination: cloudwatch + parse_json_encoded_attr_values: + - Sources + - kubernetes + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + resource_to_telemetry_conversion: + enabled: true + retain_initial_value_of_delta_metric: false + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + version: "0" extensions: - agenthealth/logs: - is_usage_data_enabled: true - stats: - operations: - - PutLogEvents - usage_flags: - mode: OP - region_type: ACJ + agenthealth/logs: + is_usage_data_enabled: true + stats: + operations: + - PutLogEvents + usage_flags: + mode: OP + region_type: ACJ processors: - batch/containerinsights: - metadata_cardinality_limit: 1000 - send_batch_max_size: 0 - send_batch_size: 8192 - timeout: 5s - batch/emf_logs: - metadata_cardinality_limit: 1000 - send_batch_max_size: 0 - send_batch_size: 8192 - timeout: 5s - gpuattributes/containerinsights: {} - metricstransform/containerinsights: - transforms: - - action: insert - aggregation_type: "" - experimental_match_labels: - code: ^5.* - include: apiserver_request_total - match_type: regexp - new_name: apiserver_request_total_5xx - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT - match_type: "" - new_name: container_gpu_memory_utilization - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT - match_type: "" - new_name: pod_gpu_memory_utilization - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT - match_type: "" - new_name: node_gpu_memory_utilization - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: container_gpu_memory_used - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: pod_gpu_memory_used - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: node_gpu_memory_used - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL - match_type: "" - new_name: container_gpu_memory_total - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL - match_type: "" - new_name: pod_gpu_memory_total - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL - match_type: "" - new_name: node_gpu_memory_total - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: container_gpu_temperature - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: pod_gpu_temperature - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: node_gpu_temperature - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: container_gpu_power_draw - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: pod_gpu_power_draw - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: node_gpu_power_draw - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: container_gpu_utilization - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: pod_gpu_utilization - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: node_gpu_utilization - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_memory_usage_model_shared_scratchpad - match_type: "" - new_name: neuroncore_memory_usage_model_shared_scratchpad - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_memory_usage_tensors - match_type: "" - new_name: neuroncore_memory_usage_tensors - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: hardware_ecc_events_total - match_type: "" - new_name: neurondevice_hw_ecc_events - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: execution_latency_seconds - match_type: "" - new_name: neuron_execution_latency - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: execution_status_total - match_type: "" - new_name: neuron_execution_status - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuron_runtime_memory_used_bytes - match_type: "" - new_name: neurondevice_runtime_memory_used_bytes - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_memory_usage_model_code - match_type: "" - new_name: neuroncore_memory_usage_model_code - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_memory_usage_runtime_memory - match_type: "" - new_name: neuroncore_memory_usage_runtime_memory - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_utilization_ratio - match_type: "" - new_name: neuroncore_utilization - operations: - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: update - aggregation_type: "" - include: instance_info - match_type: "" - new_name: instance_info - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuron_hardware - match_type: "" - new_name: neuron_hardware - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: execution_errors_total - match_type: "" - new_name: neuron_execution_errors - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_memory_usage_constants - match_type: "" - new_name: neuroncore_memory_usage_constants - operations: [] - submatch_case: "" + batch/containerinsights: + metadata_cardinality_limit: 1000 + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + batch/emf_logs: + metadata_cardinality_limit: 1000 + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + gpuattributes/containerinsights: {} + metricstransform/containerinsights: + transforms: + - action: insert + aggregation_type: "" + experimental_match_labels: + code: ^5.* + include: apiserver_request_total + match_type: regexp + new_name: apiserver_request_total_5xx + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: container_gpu_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: pod_gpu_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: node_gpu_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: container_gpu_memory_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: pod_gpu_memory_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: node_gpu_memory_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: container_gpu_memory_used + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: pod_gpu_memory_used + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: node_gpu_memory_used + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: container_gpu_memory_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: pod_gpu_memory_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: node_gpu_memory_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: container_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: pod_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: node_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: container_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: pod_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: node_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: pod_gpu_limit + match_type: "" + new_name: node_gpu_limit + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: pod_gpu_limit + match_type: "" + new_name: cluster_gpu_limit + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ClusterGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: pod_gpu_request + match_type: "" + new_name: node_gpu_request + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: pod_gpu_request + match_type: "" + new_name: cluster_gpu_request + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ClusterGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: pod_gpu_total + match_type: "" + new_name: node_gpu_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: pod_gpu_total + match_type: "" + new_name: cluster_gpu_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ClusterGPU + submatch_case: "" + - action: update + aggregation_type: "" + include: neuron_runtime_memory_used_bytes + match_type: "" + new_name: neurondevice_runtime_memory_used_bytes + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_constants + match_type: "" + new_name: neuroncore_memory_usage_constants + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_model_shared_scratchpad + match_type: "" + new_name: neuroncore_memory_usage_model_shared_scratchpad + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_utilization_ratio + match_type: "" + new_name: neuroncore_utilization + operations: + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: update + aggregation_type: "" + include: instance_info + match_type: "" + new_name: instance_info + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuron_hardware + match_type: "" + new_name: neuron_hardware + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: hardware_ecc_events_total + match_type: "" + new_name: neurondevice_hw_ecc_events + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: execution_status_total + match_type: "" + new_name: neuron_execution_status + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: execution_latency_seconds + match_type: "" + new_name: neuron_execution_latency + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_model_code + match_type: "" + new_name: neuroncore_memory_usage_model_code + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_runtime_memory + match_type: "" + new_name: neuroncore_memory_usage_runtime_memory + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_tensors + match_type: "" + new_name: neuroncore_memory_usage_tensors + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: execution_errors_total + match_type: "" + new_name: neuron_execution_errors + operations: [] + submatch_case: "" receivers: - awscontainerinsightreceiver: - accelerated_compute_metrics: true - add_container_name_metric_label: true - add_full_pod_name_metric_label: true - add_service_as_attribute: true - certificate_file_path: "" - cluster_name: TestCluster - collection_interval: 30s - container_orchestrator: eks - enable_control_plane_metrics: true - endpoint: "" - imds_retries: 2 - leader_lock_name: cwagent-clusterleader - leader_lock_using_config_map_only: true - local_mode: true - max_retries: 0 - no_verify_ssl: false - num_workers: 0 - prefer_full_pod_name: true - profile: default - proxy_address: "" - region: us-east-1 - request_timeout_seconds: 0 - resource_arn: "" - role_arn: "" - shared_credentials_file: - - /root/.aws/credentials - tcplog/emf_logs: - encoding: utf-8 - id: tcp_input - listen_address: 0.0.0.0:25888 - operators: [] - retry_on_failure: - enabled: false - initial_interval: 0s - max_elapsed_time: 0s - max_interval: 0s - type: tcp_input - udplog/emf_logs: - encoding: utf-8 - id: udp_input - listen_address: 0.0.0.0:25888 - multiline: - line_end_pattern: .^ - line_start_pattern: "" - omit_pattern: false - operators: [] - retry_on_failure: - enabled: false - initial_interval: 0s - max_elapsed_time: 0s - max_interval: 0s - type: udp_input + awscontainerinsightreceiver: + accelerated_compute_metrics: true + add_container_name_metric_label: true + add_full_pod_name_metric_label: true + add_service_as_attribute: true + certificate_file_path: "" + cluster_name: TestCluster + collection_interval: 30s + container_orchestrator: eks + enable_control_plane_metrics: true + endpoint: "" + imds_retries: 2 + leader_lock_name: cwagent-clusterleader + leader_lock_using_config_map_only: true + local_mode: true + max_retries: 0 + no_verify_ssl: false + num_workers: 0 + prefer_full_pod_name: true + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 0 + resource_arn: "" + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + tcplog/emf_logs: + encoding: utf-8 + id: tcp_input + listen_address: 0.0.0.0:25888 + operators: [] + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + type: tcp_input + udplog/emf_logs: + encoding: utf-8 + id: udp_input + listen_address: 0.0.0.0:25888 + multiline: + line_end_pattern: .^ + line_start_pattern: "" + omit_pattern: false + operators: [] + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + type: udp_input service: - extensions: - - agenthealth/logs - pipelines: - logs/emf_logs: - exporters: - - awscloudwatchlogs/emf_logs - processors: - - batch/emf_logs - receivers: - - tcplog/emf_logs - - udplog/emf_logs - metrics/containerinsights: - exporters: - - awsemf/containerinsights - processors: - - metricstransform/containerinsights - - gpuattributes/containerinsights - - batch/containerinsights - receivers: - - awscontainerinsightreceiver - telemetry: - logs: - development: false - disable_caller: false - disable_stacktrace: false - encoding: console - level: info - sampling: - enabled: true - initial: 2 - thereafter: 500 - tick: 10s - metrics: - address: "" - level: None - traces: {} + extensions: + - agenthealth/logs + pipelines: + logs/emf_logs: + exporters: + - awscloudwatchlogs/emf_logs + processors: + - batch/emf_logs + receivers: + - tcplog/emf_logs + - udplog/emf_logs + metrics/containerinsights: + exporters: + - awsemf/containerinsights + processors: + - metricstransform/containerinsights + - gpuattributes/containerinsights + - batch/containerinsights + receivers: + - awscontainerinsightreceiver + telemetry: + logs: + development: false + disable_caller: false + disable_stacktrace: false + encoding: console + level: info + sampling: + enabled: true + initial: 2 + thereafter: 500 + tick: 10s + metrics: + address: "" + level: None + traces: {} diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index c5db4fb26d..25367d7d0e 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -505,7 +505,15 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar }, }, { - Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}}, + MetricNameSelectors: []string{ + "pod_gpu_total", + "pod_gpu_request", + "pod_gpu_limit", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType"}}, MetricNameSelectors: []string{ "node_gpu_total", "node_gpu_request", @@ -515,8 +523,8 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar { Dimensions: [][]string{{"ClusterName"}}, MetricNameSelectors: []string{ - "cluster_gpu_request", "cluster_gpu_total", + "cluster_gpu_request", }, }, }...) diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go index 73ed9d41d6..e591ee1c7e 100644 --- a/translator/translate/otel/exporter/awsemf/translator_test.go +++ b/translator/translate/otel/exporter/awsemf/translator_test.go @@ -418,7 +418,13 @@ func TestTranslator(t *testing.T) { }, }, { - Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}}, + MetricNameSelectors: []string{ + "pod_gpu_total", "pod_gpu_request", "pod_gpu_limit", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType"}}, MetricNameSelectors: []string{ "node_gpu_total", "node_gpu_request", "node_gpu_limit", }, @@ -426,7 +432,7 @@ func TestTranslator(t *testing.T) { { Dimensions: [][]string{{"ClusterName"}}, MetricNameSelectors: []string{ - "cluster_gpu_request", "cluster_gpu_total", + "cluster_gpu_total", "cluster_gpu_request", }, }, { diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index d50f5ded1e..e76a21d0cd 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -16,8 +16,6 @@ import ( "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) -const gpuLogSuffix = "GPU" - var metricDuplicateTypes = []string{ containerinsightscommon.TypeGpuContainer, containerinsightscommon.TypeGpuPod, @@ -121,6 +119,36 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { } } + // replicate pod level nvidia gpu count metrics _limit, _request and _total for node and cluster + for _, m := range []string{containerinsightscommon.GpuLimit, containerinsightscommon.GpuRequest, containerinsightscommon.GpuTotal} { + transformRules = append(transformRules, []map[string]interface{}{ + { + "include": containerinsightscommon.MetricName(containerinsightscommon.TypePod, m), + "action": "insert", + "new_name": containerinsightscommon.MetricName(containerinsightscommon.TypeNode, m), + "operations": append([]map[string]interface{}{ + { + "action": "add_label", + "new_label": containerinsightscommon.MetricType, + "new_value": containerinsightscommon.TypeGpuNode, + }, + }), + }, + { + "include": containerinsightscommon.MetricName(containerinsightscommon.TypePod, m), + "action": "insert", + "new_name": containerinsightscommon.MetricName(containerinsightscommon.TypeCluster, m), + "operations": append([]map[string]interface{}{ + { + "action": "add_label", + "new_label": containerinsightscommon.MetricType, + "new_value": containerinsightscommon.TypeGpuCluster, + }, + }), + }, + }...) + } + for oldName, newName := range renameMapForNeuronMonitor { var operations []map[string]interface{} if newName == containerinsightscommon.NeuronCoreUtilization {