From f507c65eda64bb58490041cb971b6114d6390977 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Fri, 19 Apr 2024 14:46:50 -0400 Subject: [PATCH] mimir.rules.kubernetes: Don't retry non-recoverable errors Change event processing to immediately stop applying changes to Mimir in response to events that cause HTTP 4XX errors. These errors indicate the request is malformed and will never succeed so there's no point retrying it. Fixes #610 Signed-off-by: Nick Pillitteri --- .../mimir/rules/kubernetes/events.go | 6 +- internal/mimir/client/client.go | 5 +- internal/mimir/client/client_test.go | 83 +++++++++++++++++++ 3 files changed, 91 insertions(+), 3 deletions(-) diff --git a/internal/component/mimir/rules/kubernetes/events.go b/internal/component/mimir/rules/kubernetes/events.go index 471c12b28c..717c90c3d1 100644 --- a/internal/component/mimir/rules/kubernetes/events.go +++ b/internal/component/mimir/rules/kubernetes/events.go @@ -2,12 +2,14 @@ package rules import ( "context" + "errors" "fmt" "regexp" "time" "github.com/grafana/alloy/internal/alloy/logging/level" "github.com/grafana/alloy/internal/component/common/kubernetes" + "github.com/grafana/alloy/internal/mimir/client" "github.com/hashicorp/go-multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" @@ -30,7 +32,7 @@ func (c *Component) eventLoop(ctx context.Context) { if err != nil { retries := c.queue.NumRequeues(evt) - if retries < 5 { + if retries < 5 && !errors.Is(err, client.ErrNonRecoverable) { c.metrics.eventsRetried.WithLabelValues(string(evt.Typ)).Inc() c.queue.AddRateLimited(evt) level.Error(c.log).Log( @@ -42,7 +44,7 @@ func (c *Component) eventLoop(ctx context.Context) { } else { c.metrics.eventsFailed.WithLabelValues(string(evt.Typ)).Inc() level.Error(c.log).Log( - "msg", "failed to process event, max retries exceeded", + "msg", "failed to process event, non-recoverable error or max retries exceeded", "retries", fmt.Sprintf("%d/5", retries), "err", err, ) diff --git a/internal/mimir/client/client.go b/internal/mimir/client/client.go index b3f86bf46b..d7ce888c09 100644 --- a/internal/mimir/client/client.go +++ b/internal/mimir/client/client.go @@ -11,7 +11,7 @@ import ( "net/url" "strings" - log "github.com/go-kit/log" + "github.com/go-kit/log" "github.com/grafana/alloy/internal/mimir/client/internal" "github.com/grafana/alloy/internal/useragent" "github.com/grafana/dskit/instrument" @@ -24,6 +24,7 @@ import ( var ( ErrNoConfig = errors.New("no config exists for this user") ErrResourceNotFound = errors.New("requested resource not found") + ErrNonRecoverable = errors.New("non-recoverable error response") ) // Config is used to configure a MimirClient. @@ -125,6 +126,8 @@ func checkResponse(r *http.Response) error { if r.StatusCode == http.StatusNotFound { return ErrResourceNotFound + } else if r.StatusCode/100 == 4 && r.StatusCode != http.StatusTooManyRequests { + return fmt.Errorf("%w: %s", ErrNonRecoverable, errMsg) } return errors.New(errMsg) diff --git a/internal/mimir/client/client_test.go b/internal/mimir/client/client_test.go index 262e9918a9..801ba10a0c 100644 --- a/internal/mimir/client/client_test.go +++ b/internal/mimir/client/client_test.go @@ -1,8 +1,11 @@ package client import ( + "errors" + "io" "net/http" "net/url" + "strings" "testing" "github.com/stretchr/testify/require" @@ -99,3 +102,83 @@ func TestBuildURL(t *testing.T) { }) } } + +func TestCheckResponseSuccess(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + }{ + { + name: "returns nil error for 200 response", + body: "200 message!", + statusCode: http.StatusOK, + }, + { + name: "returns nil error for 204 response", + body: "", + statusCode: http.StatusNoContent, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.NoError(t, err) + }) + } +} + +func TestCheckResponseErrors(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + fatal bool + }{ + { + name: "returns correct error for 400 response", + body: "400 message!", + statusCode: http.StatusBadRequest, + fatal: true, + }, + { + name: "returns correct error for 404 response", + body: "404 message!", + statusCode: 404, + fatal: false, + }, + { + name: "returns correct error for 429 response", + body: "429 message!", + statusCode: http.StatusTooManyRequests, + fatal: false, + }, + { + name: "returns correct error for 500 response", + body: "500 message!", + statusCode: http.StatusInternalServerError, + fatal: false, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.Error(t, err) + require.Equal(t, tt.fatal, errors.Is(err, ErrNonRecoverable), "%+v is not expected recoverable/non-recoverable", err) + }) + } +}