Skip to content

Commit

Permalink
nvidia gpu count metrics and bugfix (#1183)
Browse files Browse the repository at this point in the history
  • Loading branch information
movence authored May 29, 2024
1 parent 57742e8 commit 6d69782
Show file tree
Hide file tree
Showing 6 changed files with 1,497 additions and 1,269 deletions.
13 changes: 8 additions & 5 deletions plugins/processors/gpuattributes/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me
ils := ilms.At(j)
metrics := ils.Metrics()

d.filterGpuMetricsWithoutPodName(metrics)
d.filterGpuMetricsWithoutPodName(metrics, rs.Resource().Attributes())

metricsLength := metrics.Len()
for k := 0; k < metricsLength; k++ {
Expand Down Expand Up @@ -227,15 +227,15 @@ func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels
}

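// Remove dcgm container/pod GPU data points that have no PodName attribute, unless the resource itself carries PodName;
// a missing PodName means there is no workload associated with the container/pod. A metric is dropped once it has no data points left.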
func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice) {
func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice, resourceAttributes pcommon.Map) {
metrics.RemoveIf(func(m pmetric.Metric) bool {
isGpu := strings.Contains(m.Name(), gpuMetricIdentifier)
isContainerOrPod := strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) || strings.HasPrefix(m.Name(), gpuPodMetricPrefix)

if !isGpu || !isContainerOrPod {
return false
}

_, hasPodAtResource := resourceAttributes.Get(internal.PodName)
var dps pmetric.NumberDataPointSlice
switch m.Type() {
case pmetric.MetricTypeGauge:
Expand All @@ -246,7 +246,10 @@ func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.
d.logger.Debug("Ignore unknown metric type", zap.String(containerinsightscommon.MetricType, m.Type().String()))
}

_, hasPodInfo := dps.At(0).Attributes().Get(internal.PodName)
return !hasPodInfo
dps.RemoveIf(func(dp pmetric.NumberDataPoint) bool {
_, hasPodInfo := dp.Attributes().Get(internal.PodName)
return !hasPodInfo && !hasPodAtResource
})
return dps.Len() == 0
})
}
236 changes: 158 additions & 78 deletions plugins/processors/gpuattributes/processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,124 +19,204 @@ func TestProcessMetrics(t *testing.T) {
ctx := context.Background()

testcases := map[string]struct {
resource string
metrics pmetric.Metrics
wantCnt int
want map[string]string
resource string
metrics pmetric.Metrics
wantMetricCnt int
want []map[string]string
}{
"nonNode": {
metrics: generateMetrics("prefix", map[string]string{
"ClusterName": "cluster",
metrics: generateMetrics("prefix", []map[string]string{
{
"ClusterName": "cluster",
},
}),
wantCnt: 1,
want: map[string]string{
"ClusterName": "cluster",
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
},
},
},
"nodeDropSimple": {
metrics: generateMetrics("node", map[string]string{
"ClusterName": "cluster",
"Drop": "val",
metrics: generateMetrics("node", []map[string]string{
{
"ClusterName": "cluster",
"Drop": "val",
},
}),
wantCnt: 1,
want: map[string]string{
"ClusterName": "cluster",
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
},
},
},
"nodeDropJson": {
metrics: generateMetrics("node", map[string]string{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\"}",
metrics: generateMetrics("node", []map[string]string{
{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\"}",
},
}),
wantCnt: 1,
want: map[string]string{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\"}",
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\"}",
},
},
},
"nodeDropMixed": {
metrics: generateMetrics("node", map[string]string{
"ClusterName": "cluster",
"Drop": "val",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
metrics: generateMetrics("node", []map[string]string{
{
"ClusterName": "cluster",
"Drop": "val",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantCnt: 1,
want: map[string]string{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\"}",
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\"}",
},
},
},
"dropPodWithoutPodName": {
metrics: generateMetrics("pod", map[string]string{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
metrics: generateMetrics("pod", []map[string]string{
{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantCnt: 0,
want: map[string]string{},
wantMetricCnt: 0,
want: []map[string]string{},
},
"keepPodWithoutPodName": {
metrics: generateMetrics("pod", map[string]string{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
"keepPodWithPodName": {
metrics: generateMetrics("pod", []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantCnt: 1,
want: map[string]string{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\"}",
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\"}",
},
},
},
"dropContainerWithoutPodName": {
metrics: generateMetrics("container", map[string]string{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
metrics: generateMetrics("container", []map[string]string{
{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantCnt: 0,
want: map[string]string{},
wantMetricCnt: 0,
want: []map[string]string{},
},
"keepContainerWithoutPodName": {
metrics: generateMetrics("container", map[string]string{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
"keepContainerWithPodName": {
metrics: generateMetrics("container", []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantCnt: 1,
want: map[string]string{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\"}",
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\"}",
},
},
},
"dropSingleDatapointWithoutPodName": {
metrics: generateMetrics("container", []map[string]string{
{
"ClusterName": "cluster",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod",
"kubernetes": "{\"host\":\"test\"}",
},
},
},
"keepAllDatapoints": {
metrics: generateMetrics("container", []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod1",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
{
"ClusterName": "cluster",
"PodName": "pod2",
"kubernetes": "{\"host\":\"test\",\"b\":\"2\"}",
},
}),
wantMetricCnt: 1,
want: []map[string]string{
{
"ClusterName": "cluster",
"PodName": "pod1",
"kubernetes": "{\"host\":\"test\"}",
},
{
"ClusterName": "cluster",
"PodName": "pod2",
"kubernetes": "{\"host\":\"test\"}",
},
},
},
}

for tname, tc := range testcases {
fmt.Printf("running %s\n", tname)
ms, _ := gp.processMetrics(ctx, tc.metrics)
assert.Equal(t, tc.wantCnt, ms.MetricCount())
if tc.wantCnt > 0 {
attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes()
assert.Equal(t, len(tc.want), attrs.Len())
for k, v := range tc.want {
got, ok := attrs.Get(k)
assert.True(t, ok)
assert.Equal(t, v, got.Str())
assert.Equal(t, tc.wantMetricCnt, ms.MetricCount())
if tc.wantMetricCnt > 0 {
dps := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints()
assert.Equal(t, len(tc.want), dps.Len())
for i, dim := range tc.want {
attrs := dps.At(i).Attributes()
assert.Equal(t, len(dim), attrs.Len())
for k, v := range dim {
got, ok := attrs.Get(k)
assert.True(t, ok)
assert.Equal(t, v, got.Str())
}
}
}
}
}

func generateMetrics(prefix string, dimensions map[string]string) pmetric.Metrics {
func generateMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics {
md := pmetric.NewMetrics()

m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty()
m.SetName(prefix + gpuMetricIdentifier)
gauge := m.SetEmptyGauge().DataPoints().AppendEmpty()
gauge.SetIntValue(10)

for k, v := range dimensions {
gauge.Attributes().PutStr(k, v)
ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty()
ms.SetName(prefix + gpuMetricIdentifier)
dps := ms.SetEmptyGauge().DataPoints()
for _, dim := range dimensions {
dp := dps.AppendEmpty()
dp.SetIntValue(10)
for k, v := range dim {
dp.Attributes().PutStr(k, v)
}
}

return md
}
Loading

0 comments on commit 6d69782

Please sign in to comment.