From 6d6978297a9bf6cdf68e625f7583d415b12abb0b Mon Sep 17 00:00:00 2001
From: Hyunsoo Kim <884273+movence@users.noreply.github.com>
Date: Wed, 29 May 2024 11:21:55 -0400
Subject: [PATCH] nvidia gpu count metrics and bugfix  (#1183)

---
 plugins/processors/gpuattributes/processor.go |   13 +-
 .../gpuattributes/processor_test.go           |  236 +-
 .../emf_and_kubernetes_with_gpu_config.yaml   | 2463 +++++++++--------
 .../otel/exporter/awsemf/kubernetes.go        |   12 +-
 .../otel/exporter/awsemf/translator_test.go   |   10 +-
 .../metricstransformprocessor/translator.go   |   32 +-
 6 files changed, 1497 insertions(+), 1269 deletions(-)

diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go
index c62dc7e6b0..94fb411a53 100644
--- a/plugins/processors/gpuattributes/processor.go
+++ b/plugins/processors/gpuattributes/processor.go
@@ -134,7 +134,7 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me
 			ils := ilms.At(j)
 			metrics := ils.Metrics()
 
-			d.filterGpuMetricsWithoutPodName(metrics)
+			d.filterGpuMetricsWithoutPodName(metrics, rs.Resource().Attributes())
 
 			metricsLength := metrics.Len()
 			for k := 0; k < metricsLength; k++ {
@@ -227,15 +227,15 @@ func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels
 }
 
 // remove dcgm metrics that do not contain PodName attribute which means there is no workload associated to container/pod
-func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice) {
+func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice, resourceAttributes pcommon.Map) {
 	metrics.RemoveIf(func(m pmetric.Metric) bool {
 		isGpu := strings.Contains(m.Name(), gpuMetricIdentifier)
 		isContainerOrPod := strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) || strings.HasPrefix(m.Name(), gpuPodMetricPrefix)
-
 		if !isGpu || !isContainerOrPod {
 			return false
 		}
 
+		_, hasPodAtResource := resourceAttributes.Get(internal.PodName)
 		var dps pmetric.NumberDataPointSlice
 		switch m.Type() {
 		case pmetric.MetricTypeGauge:
@@ -246,7 +246,10 @@ func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.
 			d.logger.Debug("Ignore unknown metric type", zap.String(containerinsightscommon.MetricType, m.Type().String()))
 		}
 
-		_, hasPodInfo := dps.At(0).Attributes().Get(internal.PodName)
-		return !hasPodInfo
+		dps.RemoveIf(func(dp pmetric.NumberDataPoint) bool {
+			_, hasPodInfo := dp.Attributes().Get(internal.PodName)
+			return !hasPodInfo && !hasPodAtResource
+		})
+		return dps.Len() == 0
 	})
 }
diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go
index 02bbd02ad7..a625945eda 100644
--- a/plugins/processors/gpuattributes/processor_test.go
+++ b/plugins/processors/gpuattributes/processor_test.go
@@ -19,93 +19,169 @@ func TestProcessMetrics(t *testing.T) {
 	ctx := context.Background()
 
 	testcases := map[string]struct {
-		resource string
-		metrics  pmetric.Metrics
-		wantCnt  int
-		want     map[string]string
+		resource      string
+		metrics       pmetric.Metrics
+		wantMetricCnt int
+		want          []map[string]string
 	}{
 		"nonNode": {
-			metrics: generateMetrics("prefix", map[string]string{
-				"ClusterName": "cluster",
+			metrics: generateMetrics("prefix", []map[string]string{
+				{
+					"ClusterName": "cluster",
+				},
 			}),
-			wantCnt: 1,
-			want: map[string]string{
-				"ClusterName": "cluster",
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+				},
 			},
 		},
 		"nodeDropSimple": {
-			metrics: generateMetrics("node", map[string]string{
-				"ClusterName": "cluster",
-				"Drop":        "val",
+			metrics: generateMetrics("node", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"Drop":        "val",
+				},
 			}),
-			wantCnt: 1,
-			want: map[string]string{
-				"ClusterName": "cluster",
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+				},
 			},
 		},
 		"nodeDropJson": {
-			metrics: generateMetrics("node", map[string]string{
-				"ClusterName": "cluster",
-				"kubernetes":  "{\"host\":\"test\"}",
+			metrics: generateMetrics("node", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
 			}),
-			wantCnt: 1,
-			want: map[string]string{
-				"ClusterName": "cluster",
-				"kubernetes":  "{\"host\":\"test\"}",
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
 			},
 		},
 		"nodeDropMixed": {
-			metrics: generateMetrics("node", map[string]string{
-				"ClusterName": "cluster",
-				"Drop":        "val",
-				"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+			metrics: generateMetrics("node", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"Drop":        "val",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
 			}),
-			wantCnt: 1,
-			want: map[string]string{
-				"ClusterName": "cluster",
-				"kubernetes":  "{\"host\":\"test\"}",
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
 			},
 		},
 		"dropPodWithoutPodName": {
-			metrics: generateMetrics("pod", map[string]string{
-				"ClusterName": "cluster",
-				"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+			metrics: generateMetrics("pod", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
 			}),
-			wantCnt: 0,
-			want:    map[string]string{},
+			wantMetricCnt: 0,
+			want:          []map[string]string{},
 		},
-		"keepPodWithoutPodName": {
-			metrics: generateMetrics("pod", map[string]string{
-				"ClusterName": "cluster",
-				"PodName":     "pod",
-				"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+		"keepPodWithPodName": {
+			metrics: generateMetrics("pod", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
 			}),
-			wantCnt: 1,
-			want: map[string]string{
-				"ClusterName": "cluster",
-				"PodName":     "pod",
-				"kubernetes":  "{\"host\":\"test\"}",
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
 			},
 		},
 		"dropContainerWithoutPodName": {
-			metrics: generateMetrics("container", map[string]string{
-				"ClusterName": "cluster",
-				"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+			metrics: generateMetrics("container", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
 			}),
-			wantCnt: 0,
-			want:    map[string]string{},
+			wantMetricCnt: 0,
+			want:          []map[string]string{},
 		},
-		"keepContainerWithoutPodName": {
-			metrics: generateMetrics("container", map[string]string{
-				"ClusterName": "cluster",
-				"PodName":     "pod",
-				"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+		"keepContainerWithPodName": {
+			metrics: generateMetrics("container", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
 			}),
-			wantCnt: 1,
-			want: map[string]string{
-				"ClusterName": "cluster",
-				"PodName":     "pod",
-				"kubernetes":  "{\"host\":\"test\"}",
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
+			},
+		},
+		"dropSingleDatapointWithoutPodName": {
+			metrics: generateMetrics("container", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
+			}),
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
+			},
+		},
+		"keepAllDatapoints": {
+			metrics: generateMetrics("container", []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod1",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod2",
+					"kubernetes":  "{\"host\":\"test\",\"b\":\"2\"}",
+				},
+			}),
+			wantMetricCnt: 1,
+			want: []map[string]string{
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod1",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
+				{
+					"ClusterName": "cluster",
+					"PodName":     "pod2",
+					"kubernetes":  "{\"host\":\"test\"}",
+				},
 			},
 		},
 	}
@@ -113,30 +189,34 @@ func TestProcessMetrics(t *testing.T) {
 	for tname, tc := range testcases {
 		fmt.Printf("running %s\n", tname)
 		ms, _ := gp.processMetrics(ctx, tc.metrics)
-		assert.Equal(t, tc.wantCnt, ms.MetricCount())
-		if tc.wantCnt > 0 {
-			attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes()
-			assert.Equal(t, len(tc.want), attrs.Len())
-			for k, v := range tc.want {
-				got, ok := attrs.Get(k)
-				assert.True(t, ok)
-				assert.Equal(t, v, got.Str())
+		assert.Equal(t, tc.wantMetricCnt, ms.MetricCount())
+		if tc.wantMetricCnt > 0 {
+			dps := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints()
+			assert.Equal(t, len(tc.want), dps.Len())
+			for i, dim := range tc.want {
+				attrs := dps.At(i).Attributes()
+				assert.Equal(t, len(dim), attrs.Len())
+				for k, v := range dim {
+					got, ok := attrs.Get(k)
+					assert.True(t, ok)
+					assert.Equal(t, v, got.Str())
+				}
 			}
 		}
 	}
 }
 
-func generateMetrics(prefix string, dimensions map[string]string) pmetric.Metrics {
+func generateMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics {
 	md := pmetric.NewMetrics()
-
-	m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty()
-	m.SetName(prefix + gpuMetricIdentifier)
-	gauge := m.SetEmptyGauge().DataPoints().AppendEmpty()
-	gauge.SetIntValue(10)
-
-	for k, v := range dimensions {
-		gauge.Attributes().PutStr(k, v)
+	ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty()
+	ms.SetName(prefix + gpuMetricIdentifier)
+	dps := ms.SetEmptyGauge().DataPoints()
+	for _, dim := range dimensions {
+		dp := dps.AppendEmpty()
+		dp.SetIntValue(10)
+		for k, v := range dim {
+			dp.Attributes().PutStr(k, v)
+		}
 	}
-
 	return md
 }
diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml
index f8137e88c9..1be722d577 100644
--- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml
+++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml
@@ -1,1185 +1,1288 @@
 exporters:
-    awscloudwatchlogs/emf_logs:
-        certificate_file_path: ""
-        emf_only: true
-        endpoint: https://fake_endpoint
-        imds_retries: 2
-        local_mode: true
-        log_group_name: emf/logs/default
-        log_retention: 0
-        log_stream_name: host_name_from_env
-        max_retries: 2
-        middleware: agenthealth/logs
-        no_verify_ssl: false
-        num_workers: 8
-        profile: default
-        proxy_address: ""
-        raw_log: true
-        region: us-east-1
-        request_timeout_seconds: 30
-        resource_arn: ""
-        retry_on_failure:
-            enabled: true
-            initial_interval: 5s
-            max_elapsed_time: 5m0s
-            max_interval: 30s
-            multiplier: 1.5
-            randomization_factor: 0.5
-        role_arn: ""
-        sending_queue:
-            enabled: true
-            num_consumers: 1
-            queue_size: 1000
-        shared_credentials_file:
-            - /root/.aws/credentials
-    awsemf/containerinsights:
-        certificate_file_path: ""
-        detailed_metrics: false
-        dimension_rollup_option: NoDimensionRollup
-        disable_metric_extraction: true
-        eks_fargate_container_insights_enabled: false
-        endpoint: https://fake_endpoint
-        enhanced_container_insights: true
-        imds_retries: 2
-        local_mode: true
-        log_group_name: /aws/containerinsights/{ClusterName}/performance
-        log_retention: 0
-        log_stream_name: '{NodeName}'
-        max_retries: 2
-        metric_declarations:
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - Namespace
-                  - PodName
-              metric_name_selectors:
-                - container_cpu_utilization
-                - container_cpu_utilization_over_container_limit
-                - container_cpu_limit
-                - container_cpu_request
-                - container_memory_utilization
-                - container_memory_utilization_over_container_limit
-                - container_memory_failures_total
-                - container_memory_limit
-                - container_memory_request
-                - container_filesystem_usage
-                - container_filesystem_available
-                - container_filesystem_utilization
-            - dimensions:
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                - - ClusterName
-                  - Namespace
-                  - Service
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-              metric_name_selectors:
-                - pod_cpu_utilization
-                - pod_memory_utilization
-                - pod_network_rx_bytes
-                - pod_network_tx_bytes
-                - pod_cpu_utilization_over_pod_limit
-                - pod_memory_utilization_over_pod_limit
-            - dimensions:
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-              metric_name_selectors:
-                - pod_interface_network_rx_dropped
-                - pod_interface_network_tx_dropped
-            - dimensions:
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - Namespace
-                  - Service
-              metric_name_selectors:
-                - pod_cpu_reserved_capacity
-                - pod_memory_reserved_capacity
-                - pod_number_of_container_restarts
-                - pod_number_of_containers
-                - pod_number_of_running_containers
-                - pod_status_ready
-                - pod_status_scheduled
-                - pod_status_running
-                - pod_status_pending
-                - pod_status_failed
-                - pod_status_unknown
-                - pod_status_succeeded
-                - pod_memory_request
-                - pod_memory_limit
-                - pod_cpu_limit
-                - pod_cpu_request
-                - pod_container_status_running
-                - pod_container_status_terminated
-                - pod_container_status_waiting
-                - pod_container_status_waiting_reason_crash_loop_back_off
-                - pod_container_status_waiting_reason_image_pull_error
-                - pod_container_status_waiting_reason_start_error
-                - pod_container_status_waiting_reason_create_container_error
-                - pod_container_status_waiting_reason_create_container_config_error
-                - pod_container_status_terminated_reason_oom_killed
-            - dimensions:
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-              metric_name_selectors:
-                - node_cpu_utilization
-                - node_memory_utilization
-                - node_network_total_bytes
-                - node_cpu_reserved_capacity
-                - node_memory_reserved_capacity
-                - node_number_of_running_pods
-                - node_number_of_running_containers
-                - node_cpu_usage_total
-                - node_cpu_limit
-                - node_memory_working_set
-                - node_memory_limit
-                - node_status_condition_ready
-                - node_status_condition_disk_pressure
-                - node_status_condition_memory_pressure
-                - node_status_condition_pid_pressure
-                - node_status_condition_network_unavailable
-                - node_status_condition_unknown
-                - node_status_capacity_pods
-                - node_status_allocatable_pods
-            - dimensions:
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-              metric_name_selectors:
-                - node_interface_network_rx_dropped
-                - node_interface_network_tx_dropped
-                - node_diskio_io_service_bytes_total
-                - node_diskio_io_serviced_total
-            - dimensions:
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-              metric_name_selectors:
-                - node_filesystem_utilization
-                - node_filesystem_inodes
-                - node_filesystem_inodes_free
-            - dimensions:
-                - - ClusterName
-                  - Namespace
-                  - Service
-                - - ClusterName
-              metric_name_selectors:
-                - service_number_of_running_pods
-            - dimensions:
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-              metric_name_selectors:
-                - replicas_desired
-                - replicas_ready
-                - status_replicas_available
-                - status_replicas_unavailable
-            - dimensions:
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-              metric_name_selectors:
-                - daemonset_status_number_available
-                - daemonset_status_number_unavailable
-            - dimensions:
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-              metric_name_selectors:
-                - namespace_number_of_running_pods
-            - dimensions:
-                - - ClusterName
-              metric_name_selectors:
-                - cluster_node_count
-                - cluster_failed_node_count
-                - cluster_number_of_running_pods
-            - dimensions:
-                - - ClusterName
-                  - endpoint
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_storage_size_bytes
-                - apiserver_storage_db_total_size_in_bytes
-                - etcd_db_total_size_in_bytes
-            - dimensions:
-                - - ClusterName
-                  - resource
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_storage_list_duration_seconds
-                - apiserver_longrunning_requests
-                - apiserver_storage_objects
-            - dimensions:
-                - - ClusterName
-                  - verb
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_request_duration_seconds
-                - rest_client_request_duration_seconds
-            - dimensions:
-                - - ClusterName
-                  - code
-                  - verb
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_request_total
-                - apiserver_request_total_5xx
-            - dimensions:
-                - - ClusterName
-                  - operation
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_admission_controller_admission_duration_seconds
-                - apiserver_admission_step_admission_duration_seconds
-                - etcd_request_duration_seconds
-            - dimensions:
-                - - ClusterName
-                  - code
-                  - method
-                - - ClusterName
-              metric_name_selectors:
-                - rest_client_requests_total
-            - dimensions:
-                - - ClusterName
-                  - request_kind
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_current_inflight_requests
-                - apiserver_current_inqueue_requests
-            - dimensions:
-                - - ClusterName
-                  - name
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_admission_webhook_admission_duration_seconds
-            - dimensions:
-                - - ClusterName
-                  - group
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_requested_deprecated_apis
-            - dimensions:
-                - - ClusterName
-                  - reason
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_flowcontrol_rejected_requests_total
-            - dimensions:
-                - - ClusterName
-                  - priority_level
-                - - ClusterName
-              metric_name_selectors:
-                - apiserver_flowcontrol_request_concurrency_limit
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - ContainerName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - GpuDevice
-                  - Namespace
-                  - PodName
-              metric_name_selectors:
-                - container_gpu_utilization
-                - container_gpu_memory_utilization
-                - container_gpu_memory_total
-                - container_gpu_memory_used
-                - container_gpu_power_draw
-                - container_gpu_temperature
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-                  - Namespace
-                  - Service
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - GpuDevice
-                  - Namespace
-                  - PodName
-              metric_name_selectors:
-                - pod_gpu_utilization
-                - pod_gpu_memory_utilization
-                - pod_gpu_memory_total
-                - pod_gpu_memory_used
-                - pod_gpu_power_draw
-                - pod_gpu_temperature
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-                  - GpuDevice
-                  - InstanceId
-                  - InstanceType
-                  - NodeName
-              metric_name_selectors:
-                - node_gpu_utilization
-                - node_gpu_memory_utilization
-                - node_gpu_memory_total
-                - node_gpu_memory_used
-                - node_gpu_power_draw
-                - node_gpu_temperature
-            - dimensions:
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-              metric_name_selectors:
-                - node_gpu_total
-                - node_gpu_request
-                - node_gpu_limit
-            - dimensions:
-                - - ClusterName
-              metric_name_selectors:
-                - cluster_gpu_request
-                - cluster_gpu_total
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - ContainerName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - NeuronCore
-                  - NeuronDevice
-                  - PodName
-              metric_name_selectors:
-                - container_neuroncore_utilization
-                - container_neuroncore_memory_usage_total
-                - container_neuroncore_memory_usage_constants
-                - container_neuroncore_memory_usage_model_code
-                - container_neuroncore_memory_usage_model_shared_scratchpad
-                - container_neuroncore_memory_usage_runtime_memory
-                - container_neuroncore_memory_usage_tensors
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - ContainerName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - NeuronDevice
-                  - PodName
-              metric_name_selectors:
-                - container_neurondevice_hw_ecc_events_total
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-                  - Namespace
-                  - Service
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - NeuronCore
-                  - NeuronDevice
-                  - PodName
-              metric_name_selectors:
-                - pod_neuroncore_utilization
-                - pod_neuroncore_memory_usage_total
-                - pod_neuroncore_memory_usage_constants
-                - pod_neuroncore_memory_usage_model_code
-                - pod_neuroncore_memory_usage_model_shared_scratchpad
-                - pod_neuroncore_memory_usage_runtime_memory
-                - pod_neuroncore_memory_usage_tensors
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-                  - Namespace
-                  - Service
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - NeuronDevice
-                  - PodName
-              metric_name_selectors:
-                - pod_neurondevice_hw_ecc_events_total
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-                  - InstanceId
-                  - InstanceType
-                  - NeuronCore
-                  - NeuronDevice
-                  - NodeName
-              metric_name_selectors:
-                - node_neuroncore_utilization
-                - node_neuroncore_memory_usage_total
-                - node_neuroncore_memory_usage_constants
-                - node_neuroncore_memory_usage_model_code
-                - node_neuroncore_memory_usage_model_shared_scratchpad
-                - node_neuroncore_memory_usage_runtime_memory
-                - node_neuroncore_memory_usage_tensors
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-              metric_name_selectors:
-                - node_neuron_execution_errors_total
-                - node_neurondevice_runtime_memory_used_bytes
-                - node_neuron_execution_latency
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-                - - ClusterName
-                  - InstanceId
-                  - NeuronDevice
-                  - NodeName
-              metric_name_selectors:
-                - node_neurondevice_hw_ecc_events_total
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - ContainerName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - ContainerName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-              metric_name_selectors:
-                - container_efa_rx_bytes
-                - container_efa_tx_bytes
-                - container_efa_rx_dropped
-                - container_efa_rdma_read_bytes
-                - container_efa_rdma_write_bytes
-                - container_efa_rdma_write_recv_bytes
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - Namespace
-                - - ClusterName
-                  - Namespace
-                  - Service
-                - - ClusterName
-                  - Namespace
-                  - PodName
-                - - ClusterName
-                  - FullPodName
-                  - Namespace
-                  - PodName
-              metric_name_selectors:
-                - pod_efa_rx_bytes
-                - pod_efa_tx_bytes
-                - pod_efa_rx_dropped
-                - pod_efa_rdma_read_bytes
-                - pod_efa_rdma_write_bytes
-                - pod_efa_rdma_write_recv_bytes
-            - dimensions:
-                - - ClusterName
-                - - ClusterName
-                  - InstanceId
-                  - NodeName
-              metric_name_selectors:
-                - node_efa_rx_bytes
-                - node_efa_tx_bytes
-                - node_efa_rx_dropped
-                - node_efa_rdma_read_bytes
-                - node_efa_rdma_write_bytes
-                - node_efa_rdma_write_recv_bytes
-        metric_descriptors:
-            - metric_name: apiserver_admission_controller_admission_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: apiserver_admission_step_admission_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: apiserver_admission_webhook_admission_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: apiserver_current_inflight_requests
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_current_inqueue_requests
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_flowcontrol_rejected_requests_total
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_flowcontrol_request_concurrency_limit
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_longrunning_requests
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_request_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: apiserver_request_total
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_request_total_5xx
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_requested_deprecated_apis
-              overwrite: true
-              unit: Count
-            - metric_name: apiserver_storage_objects
-              overwrite: true
-              unit: Count
-            - metric_name: etcd_request_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: apiserver_storage_list_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: apiserver_storage_db_total_size_in_bytes
-              overwrite: true
-              unit: Bytes
-            - metric_name: apiserver_storage_size_bytes
-              overwrite: true
-              unit: Bytes
-            - metric_name: etcd_db_total_size_in_bytes
-              overwrite: true
-              unit: Bytes
-            - metric_name: rest_client_request_duration_seconds
-              overwrite: true
-              unit: Seconds
-            - metric_name: rest_client_requests_total
-              overwrite: true
-              unit: Count
-        middleware: agenthealth/logs
-        namespace: ContainerInsights
-        no_verify_ssl: false
-        num_workers: 8
-        output_destination: cloudwatch
-        parse_json_encoded_attr_values:
-            - Sources
-            - kubernetes
-        profile: default
-        proxy_address: ""
-        region: us-east-1
-        request_timeout_seconds: 30
-        resource_arn: ""
-        resource_to_telemetry_conversion:
-            enabled: true
-        retain_initial_value_of_delta_metric: false
-        role_arn: ""
-        shared_credentials_file:
-            - /root/.aws/credentials
-        version: "0"
+  awscloudwatchlogs/emf_logs:
+    certificate_file_path: ""
+    emf_only: true
+    endpoint: https://fake_endpoint
+    imds_retries: 2
+    local_mode: true
+    log_group_name: emf/logs/default
+    log_retention: 0
+    log_stream_name: host_name_from_env
+    max_retries: 2
+    middleware: agenthealth/logs
+    no_verify_ssl: false
+    num_workers: 8
+    profile: default
+    proxy_address: ""
+    raw_log: true
+    region: us-east-1
+    request_timeout_seconds: 30
+    resource_arn: ""
+    retry_on_failure:
+      enabled: true
+      initial_interval: 5s
+      max_elapsed_time: 5m0s
+      max_interval: 30s
+      multiplier: 1.5
+      randomization_factor: 0.5
+    role_arn: ""
+    sending_queue:
+      enabled: true
+      num_consumers: 1
+      queue_size: 1000
+    shared_credentials_file:
+      - /root/.aws/credentials
+  awsemf/containerinsights:
+    certificate_file_path: ""
+    detailed_metrics: false
+    dimension_rollup_option: NoDimensionRollup
+    disable_metric_extraction: true
+    eks_fargate_container_insights_enabled: false
+    endpoint: https://fake_endpoint
+    enhanced_container_insights: true
+    imds_retries: 2
+    local_mode: true
+    log_group_name: /aws/containerinsights/{ClusterName}/performance
+    log_retention: 0
+    log_stream_name: '{NodeName}'
+    max_retries: 2
+    metric_declarations:
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - container_cpu_utilization
+          - container_cpu_utilization_over_container_limit
+          - container_cpu_limit
+          - container_cpu_request
+          - container_memory_utilization
+          - container_memory_utilization_over_container_limit
+          - container_memory_failures_total
+          - container_memory_limit
+          - container_memory_request
+          - container_filesystem_usage
+          - container_filesystem_available
+          - container_filesystem_utilization
+      - dimensions:
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - pod_cpu_utilization
+          - pod_memory_utilization
+          - pod_network_rx_bytes
+          - pod_network_tx_bytes
+          - pod_cpu_utilization_over_pod_limit
+          - pod_memory_utilization_over_pod_limit
+      - dimensions:
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+        metric_name_selectors:
+          - pod_interface_network_rx_dropped
+          - pod_interface_network_tx_dropped
+      - dimensions:
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - Namespace
+            - Service
+        metric_name_selectors:
+          - pod_cpu_reserved_capacity
+          - pod_memory_reserved_capacity
+          - pod_number_of_container_restarts
+          - pod_number_of_containers
+          - pod_number_of_running_containers
+          - pod_status_ready
+          - pod_status_scheduled
+          - pod_status_running
+          - pod_status_pending
+          - pod_status_failed
+          - pod_status_unknown
+          - pod_status_succeeded
+          - pod_memory_request
+          - pod_memory_limit
+          - pod_cpu_limit
+          - pod_cpu_request
+          - pod_container_status_running
+          - pod_container_status_terminated
+          - pod_container_status_waiting
+          - pod_container_status_waiting_reason_crash_loop_back_off
+          - pod_container_status_waiting_reason_image_pull_error
+          - pod_container_status_waiting_reason_start_error
+          - pod_container_status_waiting_reason_create_container_error
+          - pod_container_status_waiting_reason_create_container_config_error
+          - pod_container_status_terminated_reason_oom_killed
+      - dimensions:
+          - - ClusterName
+            - InstanceId
+            - NodeName
+          - - ClusterName
+        metric_name_selectors:
+          - node_cpu_utilization
+          - node_memory_utilization
+          - node_network_total_bytes
+          - node_cpu_reserved_capacity
+          - node_memory_reserved_capacity
+          - node_number_of_running_pods
+          - node_number_of_running_containers
+          - node_cpu_usage_total
+          - node_cpu_limit
+          - node_memory_working_set
+          - node_memory_limit
+          - node_status_condition_ready
+          - node_status_condition_disk_pressure
+          - node_status_condition_memory_pressure
+          - node_status_condition_pid_pressure
+          - node_status_condition_network_unavailable
+          - node_status_condition_unknown
+          - node_status_capacity_pods
+          - node_status_allocatable_pods
+      - dimensions:
+          - - ClusterName
+            - InstanceId
+            - NodeName
+          - - ClusterName
+        metric_name_selectors:
+          - node_interface_network_rx_dropped
+          - node_interface_network_tx_dropped
+          - node_diskio_io_service_bytes_total
+          - node_diskio_io_serviced_total
+      - dimensions:
+          - - ClusterName
+            - InstanceId
+            - NodeName
+          - - ClusterName
+        metric_name_selectors:
+          - node_filesystem_utilization
+          - node_filesystem_inodes
+          - node_filesystem_inodes_free
+      - dimensions:
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+        metric_name_selectors:
+          - service_number_of_running_pods
+      - dimensions:
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+        metric_name_selectors:
+          - replicas_desired
+          - replicas_ready
+          - status_replicas_available
+          - status_replicas_unavailable
+      - dimensions:
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+        metric_name_selectors:
+          - daemonset_status_number_available
+          - daemonset_status_number_unavailable
+      - dimensions:
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+        metric_name_selectors:
+          - namespace_number_of_running_pods
+      - dimensions:
+          - - ClusterName
+        metric_name_selectors:
+          - cluster_node_count
+          - cluster_failed_node_count
+          - cluster_number_of_running_pods
+      - dimensions:
+          - - ClusterName
+            - endpoint
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_storage_size_bytes
+          - apiserver_storage_db_total_size_in_bytes
+          - etcd_db_total_size_in_bytes
+      - dimensions:
+          - - ClusterName
+            - resource
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_storage_list_duration_seconds
+          - apiserver_longrunning_requests
+          - apiserver_storage_objects
+      - dimensions:
+          - - ClusterName
+            - verb
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_request_duration_seconds
+          - rest_client_request_duration_seconds
+      - dimensions:
+          - - ClusterName
+            - code
+            - verb
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_request_total
+          - apiserver_request_total_5xx
+      - dimensions:
+          - - ClusterName
+            - operation
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_admission_controller_admission_duration_seconds
+          - apiserver_admission_step_admission_duration_seconds
+          - etcd_request_duration_seconds
+      - dimensions:
+          - - ClusterName
+            - code
+            - method
+          - - ClusterName
+        metric_name_selectors:
+          - rest_client_requests_total
+      - dimensions:
+          - - ClusterName
+            - request_kind
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_current_inflight_requests
+          - apiserver_current_inqueue_requests
+      - dimensions:
+          - - ClusterName
+            - name
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_admission_webhook_admission_duration_seconds
+      - dimensions:
+          - - ClusterName
+            - group
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_requested_deprecated_apis
+      - dimensions:
+          - - ClusterName
+            - reason
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_flowcontrol_rejected_requests_total
+      - dimensions:
+          - - ClusterName
+            - priority_level
+          - - ClusterName
+        metric_name_selectors:
+          - apiserver_flowcontrol_request_concurrency_limit
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - ContainerName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - GpuDevice
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - container_gpu_utilization
+          - container_gpu_memory_utilization
+          - container_gpu_memory_total
+          - container_gpu_memory_used
+          - container_gpu_power_draw
+          - container_gpu_temperature
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - GpuDevice
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - pod_gpu_utilization
+          - pod_gpu_memory_utilization
+          - pod_gpu_memory_total
+          - pod_gpu_memory_used
+          - pod_gpu_power_draw
+          - pod_gpu_temperature
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - InstanceId
+            - NodeName
+          - - ClusterName
+            - GpuDevice
+            - InstanceId
+            - InstanceType
+            - NodeName
+        metric_name_selectors:
+          - node_gpu_utilization
+          - node_gpu_memory_utilization
+          - node_gpu_memory_total
+          - node_gpu_memory_used
+          - node_gpu_power_draw
+          - node_gpu_temperature
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - pod_gpu_total
+          - pod_gpu_request
+          - pod_gpu_limit
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - InstanceId
+            - InstanceType
+            - NodeName
+        metric_name_selectors:
+          - node_gpu_total
+          - node_gpu_request
+          - node_gpu_limit
+      - dimensions:
+          - - ClusterName
+        metric_name_selectors:
+          - cluster_gpu_total
+          - cluster_gpu_request
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - ContainerName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - NeuronCore
+            - NeuronDevice
+            - PodName
+        metric_name_selectors:
+          - container_neuroncore_utilization
+          - container_neuroncore_memory_usage_total
+          - container_neuroncore_memory_usage_constants
+          - container_neuroncore_memory_usage_model_code
+          - container_neuroncore_memory_usage_model_shared_scratchpad
+          - container_neuroncore_memory_usage_runtime_memory
+          - container_neuroncore_memory_usage_tensors
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - ContainerName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - NeuronDevice
+            - PodName
+        metric_name_selectors:
+          - container_neurondevice_hw_ecc_events_total
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - NeuronCore
+            - NeuronDevice
+            - PodName
+        metric_name_selectors:
+          - pod_neuroncore_utilization
+          - pod_neuroncore_memory_usage_total
+          - pod_neuroncore_memory_usage_constants
+          - pod_neuroncore_memory_usage_model_code
+          - pod_neuroncore_memory_usage_model_shared_scratchpad
+          - pod_neuroncore_memory_usage_runtime_memory
+          - pod_neuroncore_memory_usage_tensors
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - NeuronDevice
+            - PodName
+        metric_name_selectors:
+          - pod_neurondevice_hw_ecc_events_total
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - InstanceId
+            - NodeName
+          - - ClusterName
+            - InstanceId
+            - InstanceType
+            - NeuronCore
+            - NeuronDevice
+            - NodeName
+        metric_name_selectors:
+          - node_neuroncore_utilization
+          - node_neuroncore_memory_usage_total
+          - node_neuroncore_memory_usage_constants
+          - node_neuroncore_memory_usage_model_code
+          - node_neuroncore_memory_usage_model_shared_scratchpad
+          - node_neuroncore_memory_usage_runtime_memory
+          - node_neuroncore_memory_usage_tensors
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - InstanceId
+            - NodeName
+        metric_name_selectors:
+          - node_neuron_execution_errors_total
+          - node_neurondevice_runtime_memory_used_bytes
+          - node_neuron_execution_latency
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - InstanceId
+            - NodeName
+          - - ClusterName
+            - InstanceId
+            - NeuronDevice
+            - NodeName
+        metric_name_selectors:
+          - node_neurondevice_hw_ecc_events_total
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - ContainerName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - ContainerName
+            - FullPodName
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - container_efa_rx_bytes
+          - container_efa_tx_bytes
+          - container_efa_rx_dropped
+          - container_efa_rdma_read_bytes
+          - container_efa_rdma_write_bytes
+          - container_efa_rdma_write_recv_bytes
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - Namespace
+          - - ClusterName
+            - Namespace
+            - Service
+          - - ClusterName
+            - Namespace
+            - PodName
+          - - ClusterName
+            - FullPodName
+            - Namespace
+            - PodName
+        metric_name_selectors:
+          - pod_efa_rx_bytes
+          - pod_efa_tx_bytes
+          - pod_efa_rx_dropped
+          - pod_efa_rdma_read_bytes
+          - pod_efa_rdma_write_bytes
+          - pod_efa_rdma_write_recv_bytes
+      - dimensions:
+          - - ClusterName
+          - - ClusterName
+            - InstanceId
+            - NodeName
+        metric_name_selectors:
+          - node_efa_rx_bytes
+          - node_efa_tx_bytes
+          - node_efa_rx_dropped
+          - node_efa_rdma_read_bytes
+          - node_efa_rdma_write_bytes
+          - node_efa_rdma_write_recv_bytes
+    metric_descriptors:
+      - metric_name: apiserver_admission_controller_admission_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: apiserver_admission_step_admission_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: apiserver_admission_webhook_admission_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: apiserver_current_inflight_requests
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_current_inqueue_requests
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_flowcontrol_rejected_requests_total
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_flowcontrol_request_concurrency_limit
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_longrunning_requests
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_request_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: apiserver_request_total
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_request_total_5xx
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_requested_deprecated_apis
+        overwrite: true
+        unit: Count
+      - metric_name: apiserver_storage_objects
+        overwrite: true
+        unit: Count
+      - metric_name: etcd_request_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: apiserver_storage_list_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: apiserver_storage_db_total_size_in_bytes
+        overwrite: true
+        unit: Bytes
+      - metric_name: apiserver_storage_size_bytes
+        overwrite: true
+        unit: Bytes
+      - metric_name: etcd_db_total_size_in_bytes
+        overwrite: true
+        unit: Bytes
+      - metric_name: rest_client_request_duration_seconds
+        overwrite: true
+        unit: Seconds
+      - metric_name: rest_client_requests_total
+        overwrite: true
+        unit: Count
+    middleware: agenthealth/logs
+    namespace: ContainerInsights
+    no_verify_ssl: false
+    num_workers: 8
+    output_destination: cloudwatch
+    parse_json_encoded_attr_values:
+      - Sources
+      - kubernetes
+    profile: default
+    proxy_address: ""
+    region: us-east-1
+    request_timeout_seconds: 30
+    resource_arn: ""
+    resource_to_telemetry_conversion:
+      enabled: true
+    retain_initial_value_of_delta_metric: false
+    role_arn: ""
+    shared_credentials_file:
+      - /root/.aws/credentials
+    version: "0"
 extensions:
-    agenthealth/logs:
-        is_usage_data_enabled: true
-        stats:
-            operations:
-                - PutLogEvents
-            usage_flags:
-                mode: OP
-                region_type: ACJ
+  agenthealth/logs:
+    is_usage_data_enabled: true
+    stats:
+      operations:
+        - PutLogEvents
+      usage_flags:
+        mode: OP
+        region_type: ACJ
 processors:
-    batch/containerinsights:
-        metadata_cardinality_limit: 1000
-        send_batch_max_size: 0
-        send_batch_size: 8192
-        timeout: 5s
-    batch/emf_logs:
-        metadata_cardinality_limit: 1000
-        send_batch_max_size: 0
-        send_batch_size: 8192
-        timeout: 5s
-    gpuattributes/containerinsights: {}
-    metricstransform/containerinsights:
-        transforms:
-            - action: insert
-              aggregation_type: ""
-              experimental_match_labels:
-                code: ^5.*
-              include: apiserver_request_total
-              match_type: regexp
-              new_name: apiserver_request_total_5xx
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_USED_PERCENT
-              match_type: ""
-              new_name: container_gpu_memory_utilization
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: ContainerGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 100
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_USED_PERCENT
-              match_type: ""
-              new_name: pod_gpu_memory_utilization
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: PodGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 100
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_USED_PERCENT
-              match_type: ""
-              new_name: node_gpu_memory_utilization
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: NodeGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 100
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_USED
-              match_type: ""
-              new_name: container_gpu_memory_used
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: ContainerGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 1.048576e+06
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_USED
-              match_type: ""
-              new_name: pod_gpu_memory_used
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: PodGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 1.048576e+06
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_USED
-              match_type: ""
-              new_name: node_gpu_memory_used
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: NodeGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 1.048576e+06
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_TOTAL
-              match_type: ""
-              new_name: container_gpu_memory_total
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: ContainerGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 1.048576e+06
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_TOTAL
-              match_type: ""
-              new_name: pod_gpu_memory_total
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: PodGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 1.048576e+06
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_FB_TOTAL
-              match_type: ""
-              new_name: node_gpu_memory_total
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: NodeGPU
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 1.048576e+06
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_GPU_TEMP
-              match_type: ""
-              new_name: container_gpu_temperature
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: ContainerGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_GPU_TEMP
-              match_type: ""
-              new_name: pod_gpu_temperature
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: PodGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_GPU_TEMP
-              match_type: ""
-              new_name: node_gpu_temperature
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: NodeGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_POWER_USAGE
-              match_type: ""
-              new_name: container_gpu_power_draw
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: ContainerGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_POWER_USAGE
-              match_type: ""
-              new_name: pod_gpu_power_draw
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: PodGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_POWER_USAGE
-              match_type: ""
-              new_name: node_gpu_power_draw
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: NodeGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_GPU_UTIL
-              match_type: ""
-              new_name: container_gpu_utilization
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: ContainerGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_GPU_UTIL
-              match_type: ""
-              new_name: pod_gpu_utilization
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: PodGPU
-              submatch_case: ""
-            - action: insert
-              aggregation_type: ""
-              include: DCGM_FI_DEV_GPU_UTIL
-              match_type: ""
-              new_name: node_gpu_utilization
-              operations:
-                - action: add_label
-                  aggregation_type: ""
-                  experimental_scale: 0
-                  label: ""
-                  label_value: ""
-                  new_label: Type
-                  new_value: NodeGPU
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuroncore_memory_usage_model_shared_scratchpad
-              match_type: ""
-              new_name: neuroncore_memory_usage_model_shared_scratchpad
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuroncore_memory_usage_tensors
-              match_type: ""
-              new_name: neuroncore_memory_usage_tensors
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: hardware_ecc_events_total
-              match_type: ""
-              new_name: neurondevice_hw_ecc_events
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: execution_latency_seconds
-              match_type: ""
-              new_name: neuron_execution_latency
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: execution_status_total
-              match_type: ""
-              new_name: neuron_execution_status
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuron_runtime_memory_used_bytes
-              match_type: ""
-              new_name: neurondevice_runtime_memory_used_bytes
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuroncore_memory_usage_model_code
-              match_type: ""
-              new_name: neuroncore_memory_usage_model_code
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuroncore_memory_usage_runtime_memory
-              match_type: ""
-              new_name: neuroncore_memory_usage_runtime_memory
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuroncore_utilization_ratio
-              match_type: ""
-              new_name: neuroncore_utilization
-              operations:
-                - action: experimental_scale_value
-                  aggregation_type: ""
-                  experimental_scale: 100
-                  label: ""
-                  label_value: ""
-                  new_label: ""
-                  new_value: ""
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: instance_info
-              match_type: ""
-              new_name: instance_info
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuron_hardware
-              match_type: ""
-              new_name: neuron_hardware
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: execution_errors_total
-              match_type: ""
-              new_name: neuron_execution_errors
-              operations: []
-              submatch_case: ""
-            - action: update
-              aggregation_type: ""
-              include: neuroncore_memory_usage_constants
-              match_type: ""
-              new_name: neuroncore_memory_usage_constants
-              operations: []
-              submatch_case: ""
+  batch/containerinsights:
+    metadata_cardinality_limit: 1000
+    send_batch_max_size: 0
+    send_batch_size: 8192
+    timeout: 5s
+  batch/emf_logs:
+    metadata_cardinality_limit: 1000
+    send_batch_max_size: 0
+    send_batch_size: 8192
+    timeout: 5s
+  gpuattributes/containerinsights: {}
+  metricstransform/containerinsights:
+    transforms:
+      - action: insert
+        aggregation_type: ""
+        experimental_match_labels:
+          code: ^5.*
+        include: apiserver_request_total
+        match_type: regexp
+        new_name: apiserver_request_total_5xx
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_GPU_UTIL
+        match_type: ""
+        new_name: container_gpu_utilization
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ContainerGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_GPU_UTIL
+        match_type: ""
+        new_name: pod_gpu_utilization
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: PodGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_GPU_UTIL
+        match_type: ""
+        new_name: node_gpu_utilization
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_USED_PERCENT
+        match_type: ""
+        new_name: container_gpu_memory_utilization
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ContainerGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 100
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_USED_PERCENT
+        match_type: ""
+        new_name: pod_gpu_memory_utilization
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: PodGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 100
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_USED_PERCENT
+        match_type: ""
+        new_name: node_gpu_memory_utilization
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 100
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_USED
+        match_type: ""
+        new_name: container_gpu_memory_used
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ContainerGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 1.048576e+06
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_USED
+        match_type: ""
+        new_name: pod_gpu_memory_used
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: PodGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 1.048576e+06
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_USED
+        match_type: ""
+        new_name: node_gpu_memory_used
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 1.048576e+06
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_TOTAL
+        match_type: ""
+        new_name: container_gpu_memory_total
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ContainerGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 1.048576e+06
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_TOTAL
+        match_type: ""
+        new_name: pod_gpu_memory_total
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: PodGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 1.048576e+06
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_FB_TOTAL
+        match_type: ""
+        new_name: node_gpu_memory_total
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 1.048576e+06
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_GPU_TEMP
+        match_type: ""
+        new_name: container_gpu_temperature
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ContainerGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_GPU_TEMP
+        match_type: ""
+        new_name: pod_gpu_temperature
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: PodGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_GPU_TEMP
+        match_type: ""
+        new_name: node_gpu_temperature
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_POWER_USAGE
+        match_type: ""
+        new_name: container_gpu_power_draw
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ContainerGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_POWER_USAGE
+        match_type: ""
+        new_name: pod_gpu_power_draw
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: PodGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: DCGM_FI_DEV_POWER_USAGE
+        match_type: ""
+        new_name: node_gpu_power_draw
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: pod_gpu_limit
+        match_type: ""
+        new_name: node_gpu_limit
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: pod_gpu_limit
+        match_type: ""
+        new_name: cluster_gpu_limit
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ClusterGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: pod_gpu_request
+        match_type: ""
+        new_name: node_gpu_request
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: pod_gpu_request
+        match_type: ""
+        new_name: cluster_gpu_request
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ClusterGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: pod_gpu_total
+        match_type: ""
+        new_name: node_gpu_total
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: NodeGPU
+        submatch_case: ""
+      - action: insert
+        aggregation_type: ""
+        include: pod_gpu_total
+        match_type: ""
+        new_name: cluster_gpu_total
+        operations:
+          - action: add_label
+            aggregation_type: ""
+            experimental_scale: 0
+            label: ""
+            label_value: ""
+            new_label: Type
+            new_value: ClusterGPU
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuron_runtime_memory_used_bytes
+        match_type: ""
+        new_name: neurondevice_runtime_memory_used_bytes
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuroncore_memory_usage_constants
+        match_type: ""
+        new_name: neuroncore_memory_usage_constants
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuroncore_memory_usage_model_shared_scratchpad
+        match_type: ""
+        new_name: neuroncore_memory_usage_model_shared_scratchpad
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuroncore_utilization_ratio
+        match_type: ""
+        new_name: neuroncore_utilization
+        operations:
+          - action: experimental_scale_value
+            aggregation_type: ""
+            experimental_scale: 100
+            label: ""
+            label_value: ""
+            new_label: ""
+            new_value: ""
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: instance_info
+        match_type: ""
+        new_name: instance_info
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuron_hardware
+        match_type: ""
+        new_name: neuron_hardware
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: hardware_ecc_events_total
+        match_type: ""
+        new_name: neurondevice_hw_ecc_events
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: execution_status_total
+        match_type: ""
+        new_name: neuron_execution_status
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: execution_latency_seconds
+        match_type: ""
+        new_name: neuron_execution_latency
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuroncore_memory_usage_model_code
+        match_type: ""
+        new_name: neuroncore_memory_usage_model_code
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuroncore_memory_usage_runtime_memory
+        match_type: ""
+        new_name: neuroncore_memory_usage_runtime_memory
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: neuroncore_memory_usage_tensors
+        match_type: ""
+        new_name: neuroncore_memory_usage_tensors
+        operations: []
+        submatch_case: ""
+      - action: update
+        aggregation_type: ""
+        include: execution_errors_total
+        match_type: ""
+        new_name: neuron_execution_errors
+        operations: []
+        submatch_case: ""
 receivers:
-    awscontainerinsightreceiver:
-        accelerated_compute_metrics: true
-        add_container_name_metric_label: true
-        add_full_pod_name_metric_label: true
-        add_service_as_attribute: true
-        certificate_file_path: ""
-        cluster_name: TestCluster
-        collection_interval: 30s
-        container_orchestrator: eks
-        enable_control_plane_metrics: true
-        endpoint: ""
-        imds_retries: 2
-        leader_lock_name: cwagent-clusterleader
-        leader_lock_using_config_map_only: true
-        local_mode: true
-        max_retries: 0
-        no_verify_ssl: false
-        num_workers: 0
-        prefer_full_pod_name: true
-        profile: default
-        proxy_address: ""
-        region: us-east-1
-        request_timeout_seconds: 0
-        resource_arn: ""
-        role_arn: ""
-        shared_credentials_file:
-            - /root/.aws/credentials
-    tcplog/emf_logs:
-        encoding: utf-8
-        id: tcp_input
-        listen_address: 0.0.0.0:25888
-        operators: []
-        retry_on_failure:
-            enabled: false
-            initial_interval: 0s
-            max_elapsed_time: 0s
-            max_interval: 0s
-        type: tcp_input
-    udplog/emf_logs:
-        encoding: utf-8
-        id: udp_input
-        listen_address: 0.0.0.0:25888
-        multiline:
-            line_end_pattern: .^
-            line_start_pattern: ""
-            omit_pattern: false
-        operators: []
-        retry_on_failure:
-            enabled: false
-            initial_interval: 0s
-            max_elapsed_time: 0s
-            max_interval: 0s
-        type: udp_input
+  awscontainerinsightreceiver:
+    accelerated_compute_metrics: true
+    add_container_name_metric_label: true
+    add_full_pod_name_metric_label: true
+    add_service_as_attribute: true
+    certificate_file_path: ""
+    cluster_name: TestCluster
+    collection_interval: 30s
+    container_orchestrator: eks
+    enable_control_plane_metrics: true
+    endpoint: ""
+    imds_retries: 2
+    leader_lock_name: cwagent-clusterleader
+    leader_lock_using_config_map_only: true
+    local_mode: true
+    max_retries: 0
+    no_verify_ssl: false
+    num_workers: 0
+    prefer_full_pod_name: true
+    profile: default
+    proxy_address: ""
+    region: us-east-1
+    request_timeout_seconds: 0
+    resource_arn: ""
+    role_arn: ""
+    shared_credentials_file:
+      - /root/.aws/credentials
+  tcplog/emf_logs:
+    encoding: utf-8
+    id: tcp_input
+    listen_address: 0.0.0.0:25888
+    operators: []
+    retry_on_failure:
+      enabled: false
+      initial_interval: 0s
+      max_elapsed_time: 0s
+      max_interval: 0s
+    type: tcp_input
+  udplog/emf_logs:
+    encoding: utf-8
+    id: udp_input
+    listen_address: 0.0.0.0:25888
+    multiline:
+      line_end_pattern: .^
+      line_start_pattern: ""
+      omit_pattern: false
+    operators: []
+    retry_on_failure:
+      enabled: false
+      initial_interval: 0s
+      max_elapsed_time: 0s
+      max_interval: 0s
+    type: udp_input
 service:
-    extensions:
-        - agenthealth/logs
-    pipelines:
-        logs/emf_logs:
-            exporters:
-                - awscloudwatchlogs/emf_logs
-            processors:
-                - batch/emf_logs
-            receivers:
-                - tcplog/emf_logs
-                - udplog/emf_logs
-        metrics/containerinsights:
-            exporters:
-                - awsemf/containerinsights
-            processors:
-                - metricstransform/containerinsights
-                - gpuattributes/containerinsights
-                - batch/containerinsights
-            receivers:
-                - awscontainerinsightreceiver
-    telemetry:
-        logs:
-            development: false
-            disable_caller: false
-            disable_stacktrace: false
-            encoding: console
-            level: info
-            sampling:
-                enabled: true
-                initial: 2
-                thereafter: 500
-                tick: 10s
-        metrics:
-            address: ""
-            level: None
-        traces: {}
+  extensions:
+    - agenthealth/logs
+  pipelines:
+    logs/emf_logs:
+      exporters:
+        - awscloudwatchlogs/emf_logs
+      processors:
+        - batch/emf_logs
+      receivers:
+        - tcplog/emf_logs
+        - udplog/emf_logs
+    metrics/containerinsights:
+      exporters:
+        - awsemf/containerinsights
+      processors:
+        - metricstransform/containerinsights
+        - gpuattributes/containerinsights
+        - batch/containerinsights
+      receivers:
+        - awscontainerinsightreceiver
+  telemetry:
+    logs:
+      development: false
+      disable_caller: false
+      disable_stacktrace: false
+      encoding: console
+      level: info
+      sampling:
+        enabled: true
+        initial: 2
+        thereafter: 500
+        tick: 10s
+    metrics:
+      address: ""
+      level: None
+    traces: {}
diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go
index c5db4fb26d..25367d7d0e 100644
--- a/translator/translate/otel/exporter/awsemf/kubernetes.go
+++ b/translator/translate/otel/exporter/awsemf/kubernetes.go
@@ -505,7 +505,15 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
 				},
 			},
 			{
-				Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}},
+				Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}},
+				MetricNameSelectors: []string{
+					"pod_gpu_total",
+					"pod_gpu_request",
+					"pod_gpu_limit",
+				},
+			},
+			{
+				Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType"}},
 				MetricNameSelectors: []string{
 					"node_gpu_total",
 					"node_gpu_request",
@@ -515,8 +523,8 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
 			{
 				Dimensions: [][]string{{"ClusterName"}},
 				MetricNameSelectors: []string{
-					"cluster_gpu_request",
 					"cluster_gpu_total",
+					"cluster_gpu_request",
 				},
 			},
 		}...)
diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go
index 73ed9d41d6..e591ee1c7e 100644
--- a/translator/translate/otel/exporter/awsemf/translator_test.go
+++ b/translator/translate/otel/exporter/awsemf/translator_test.go
@@ -418,7 +418,13 @@ func TestTranslator(t *testing.T) {
 						},
 					},
 					{
-						Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}},
+						Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}},
+						MetricNameSelectors: []string{
+							"pod_gpu_total", "pod_gpu_request", "pod_gpu_limit",
+						},
+					},
+					{
+						Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType"}},
 						MetricNameSelectors: []string{
 							"node_gpu_total", "node_gpu_request", "node_gpu_limit",
 						},
@@ -426,7 +432,7 @@ func TestTranslator(t *testing.T) {
 					{
 						Dimensions: [][]string{{"ClusterName"}},
 						MetricNameSelectors: []string{
-							"cluster_gpu_request", "cluster_gpu_total",
+							"cluster_gpu_total", "cluster_gpu_request",
 						},
 					},
 					{
diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go
index d50f5ded1e..e76a21d0cd 100644
--- a/translator/translate/otel/processor/metricstransformprocessor/translator.go
+++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go
@@ -16,8 +16,6 @@ import (
 	"github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight"
 )
 
-const gpuLogSuffix = "GPU"
-
 var metricDuplicateTypes = []string{
 	containerinsightscommon.TypeGpuContainer,
 	containerinsightscommon.TypeGpuPod,
@@ -121,6 +119,36 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) {
 			}
 		}
 
+		// replicate the pod-level NVIDIA GPU count metrics (_limit, _request and _total) as node- and cluster-level metrics
+		for _, m := range []string{containerinsightscommon.GpuLimit, containerinsightscommon.GpuRequest, containerinsightscommon.GpuTotal} {
+			transformRules = append(transformRules, []map[string]interface{}{
+				{
+					"include":  containerinsightscommon.MetricName(containerinsightscommon.TypePod, m),
+					"action":   "insert",
+					"new_name": containerinsightscommon.MetricName(containerinsightscommon.TypeNode, m),
+					"operations": []map[string]interface{}{
+						{
+							"action":    "add_label",
+							"new_label": containerinsightscommon.MetricType,
+							"new_value": containerinsightscommon.TypeGpuNode,
+						},
+					},
+				},
+				{
+					"include":  containerinsightscommon.MetricName(containerinsightscommon.TypePod, m),
+					"action":   "insert",
+					"new_name": containerinsightscommon.MetricName(containerinsightscommon.TypeCluster, m),
+					"operations": []map[string]interface{}{
+						{
+							"action":    "add_label",
+							"new_label": containerinsightscommon.MetricType,
+							"new_value": containerinsightscommon.TypeGpuCluster,
+						},
+					},
+				},
+			}...)
+		}
+
 		for oldName, newName := range renameMapForNeuronMonitor {
 			var operations []map[string]interface{}
 			if newName == containerinsightscommon.NeuronCoreUtilization {