-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor: Add retention hours to discarded
metrics
#15875
base: main
Are you sure you want to change the base?
Changes from 6 commits
62b8cfc
e09aba2
2e81059
57976bc
86e1cc8
a936fdf
4d25e22
045d3bf
7e288ab
12ec27e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -453,7 +453,6 @@ func (d *Distributor) Push(ctx context.Context, req *logproto.PushRequest) (*log | |
// We use the heuristic of 1 sample per TS to size the array. | ||
// We also work out the hash value at the same time. | ||
streams := make([]KeyedStream, 0, len(req.Streams)) | ||
validationMetrics := newValidationMetrics() | ||
|
||
var validationErrors util.GroupedErrors | ||
|
||
|
@@ -493,7 +492,8 @@ func (d *Distributor) Push(ctx context.Context, req *logproto.PushRequest) (*log | |
} | ||
} | ||
|
||
tenantRetentionHours := d.validator.Limits.RetentionHours(tenantID, nil) | ||
tenantRetentionHours := util.RetentionHours(d.tenantsRetention.RetentionPeriodFor(tenantID, nil)) | ||
validationMetrics := newValidationMetrics(tenantRetentionHours) | ||
|
||
func() { | ||
sp := opentracing.SpanFromContext(ctx) | ||
|
@@ -524,7 +524,7 @@ func (d *Distributor) Push(ctx context.Context, req *logproto.PushRequest) (*log | |
continue | ||
} | ||
|
||
retentionHours := d.validator.Limits.RetentionHours(tenantID, lbs) | ||
retentionHours := util.RetentionHours(d.tenantsRetention.RetentionPeriodFor(tenantID, lbs)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit for a follow-up PR: since we were resolving the retention for the streams before these changes, I think we may be able to speed up pushes by caching this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think optimizing that would do more harm than good (based on the benchmarks). Doing the new retention evaluation on pushes didn't make a difference, but adding the cache likely will, either through the extra memory usage or through the extra complexity of caching more stuff. |
||
|
||
if missing, lbsMissing := d.missingEnforcedLabels(lbs, tenantID); missing { | ||
err := fmt.Errorf(validation.MissingEnforcedLabelsErrorMsg, strings.Join(lbsMissing, ","), tenantID) | ||
|
@@ -620,7 +620,7 @@ func (d *Distributor) Push(ctx context.Context, req *logproto.PushRequest) (*log | |
} | ||
|
||
if block, until, retStatusCode := d.validator.ShouldBlockIngestion(validationContext, now); block { | ||
d.trackDiscardedData(ctx, req, validationContext, tenantID, validationMetrics, validation.BlockedIngestion, tenantRetentionHours) | ||
d.trackDiscardedData(ctx, req, validationContext, tenantID, validationMetrics, validation.BlockedIngestion) | ||
|
||
err = fmt.Errorf(validation.BlockedIngestionErrorMsg, tenantID, until.Format(time.RFC3339), retStatusCode) | ||
d.writeFailuresManager.Log(tenantID, err) | ||
|
@@ -635,7 +635,7 @@ func (d *Distributor) Push(ctx context.Context, req *logproto.PushRequest) (*log | |
} | ||
|
||
if !d.ingestionRateLimiter.AllowN(now, tenantID, validationMetrics.lineSize) { | ||
d.trackDiscardedData(ctx, req, validationContext, tenantID, validationMetrics, validation.RateLimited, tenantRetentionHours) | ||
d.trackDiscardedData(ctx, req, validationContext, tenantID, validationMetrics, validation.RateLimited) | ||
|
||
err = fmt.Errorf(validation.RateLimitedErrorMsg, tenantID, int(d.ingestionRateLimiter.Limit(now, tenantID)), validationMetrics.lineCount, validationMetrics.lineSize) | ||
d.writeFailuresManager.Log(tenantID, err) | ||
|
@@ -773,7 +773,6 @@ func (d *Distributor) trackDiscardedData( | |
tenantID string, | ||
validationMetrics validationMetrics, | ||
reason string, | ||
tenantRetentionHours string, | ||
) { | ||
for retentionHours, count := range validationMetrics.lineCountPerRetentionHours { | ||
validation.DiscardedSamples.WithLabelValues(reason, tenantID, retentionHours).Add(float64(count)) | ||
|
@@ -782,7 +781,7 @@ func (d *Distributor) trackDiscardedData( | |
|
||
if d.usageTracker != nil { | ||
for _, stream := range req.Streams { | ||
lbs, _, _, err := d.parseStreamLabels(validationContext, stream.Labels, stream, tenantRetentionHours) | ||
lbs, _, _, err := d.parseStreamLabels(validationContext, stream.Labels, stream, validationMetrics.tenantRetentionHours) | ||
if err != nil { | ||
continue | ||
} | ||
|
@@ -1264,25 +1263,3 @@ func newRingAndLifecycler(cfg RingConfig, instanceCount *atomic.Uint32, logger l | |
func (d *Distributor) HealthyInstancesCount() int { | ||
return int(d.healthyInstancesCount.Load()) | ||
} | ||
|
||
type validationMetrics struct { | ||
lineSizePerRetentionHours map[string]int | ||
lineCountPerRetentionHours map[string]int | ||
lineSize int | ||
lineCount int | ||
} | ||
|
||
func newValidationMetrics() validationMetrics { | ||
return validationMetrics{ | ||
lineSizePerRetentionHours: make(map[string]int), | ||
lineCountPerRetentionHours: make(map[string]int), | ||
} | ||
} | ||
|
||
func (v *validationMetrics) compute(entry logproto.Entry, retentionHours string) { | ||
totalEntrySize := util.EntryTotalSize(&entry) | ||
v.lineSizePerRetentionHours[retentionHours] += totalEntrySize | ||
v.lineCountPerRetentionHours[retentionHours]++ | ||
v.lineSize += totalEntrySize | ||
v.lineCount++ | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package distributor | ||
|
||
import ( | ||
"github.com/grafana/loki/v3/pkg/logproto" | ||
"github.com/grafana/loki/v3/pkg/util" | ||
) | ||
|
||
type validationMetrics struct { | ||
lineSizePerRetentionHours map[string]int | ||
lineCountPerRetentionHours map[string]int | ||
lineSize int | ||
lineCount int | ||
tenantRetentionHours string | ||
} | ||
|
||
func newValidationMetrics(tenantRetentionHours string) validationMetrics { | ||
return validationMetrics{ | ||
lineSizePerRetentionHours: make(map[string]int), | ||
lineCountPerRetentionHours: make(map[string]int), | ||
tenantRetentionHours: tenantRetentionHours, | ||
} | ||
} | ||
|
||
func (v *validationMetrics) compute(entry logproto.Entry, retentionHours string) { | ||
totalEntrySize := util.EntryTotalSize(&entry) | ||
v.lineSizePerRetentionHours[retentionHours] += totalEntrySize | ||
v.lineCountPerRetentionHours[retentionHours]++ | ||
v.lineSize += totalEntrySize | ||
v.lineCount++ | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ import ( | |
|
||
"github.com/grafana/loki/v3/pkg/analytics" | ||
"github.com/grafana/loki/v3/pkg/chunkenc" | ||
"github.com/grafana/loki/v3/pkg/compactor/retention" | ||
"github.com/grafana/loki/v3/pkg/distributor/writefailures" | ||
"github.com/grafana/loki/v3/pkg/ingester/index" | ||
"github.com/grafana/loki/v3/pkg/ingester/wal" | ||
|
@@ -126,6 +127,8 @@ type instance struct { | |
schemaconfig *config.SchemaConfig | ||
|
||
customStreamsTracker push.UsageTracker | ||
|
||
tenantsRetention *retention.TenantsRetention | ||
} | ||
|
||
func newInstance( | ||
|
@@ -143,6 +146,7 @@ func newInstance( | |
streamRateCalculator *StreamRateCalculator, | ||
writeFailures *writefailures.Manager, | ||
customStreamsTracker push.UsageTracker, | ||
tenantsRetention *retention.TenantsRetention, | ||
) (*instance, error) { | ||
invertedIndex, err := index.NewMultiInvertedIndex(periodConfigs, uint32(cfg.IndexShards)) | ||
if err != nil { | ||
|
@@ -181,6 +185,8 @@ func newInstance( | |
schemaconfig: &c, | ||
|
||
customStreamsTracker: customStreamsTracker, | ||
|
||
tenantsRetention: tenantsRetention, | ||
} | ||
i.mapper = NewFPMapper(i.getLabelsFromFingerprint) | ||
|
||
|
@@ -290,12 +296,12 @@ func (i *instance) createStream(ctx context.Context, pushReqStream logproto.Stre | |
return nil, httpgrpc.Errorf(http.StatusBadRequest, "%s", err.Error()) | ||
} | ||
|
||
retentionHours := util.RetentionHours(i.tenantsRetention.RetentionPeriodFor(i.instanceID, labels)) | ||
|
||
if record != nil { | ||
err = i.streamCountLimiter.AssertNewStreamAllowed(i.instanceID) | ||
} | ||
|
||
retentionHours := i.limiter.limits.RetentionHours(i.instanceID, labels) | ||
|
||
if err != nil { | ||
return i.onStreamCreationError(ctx, pushReqStream, err, labels, retentionHours) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: This err handler looks a bit out of place. What about getting the retentionHours after getting the labels? labels, err := syntax.ParseLabels(pushReqStream.Labels)
if err != nil {
...
}
retentionHours := i.limiter.limits.RetentionHours(i.instanceID, labels)
if record != nil {
err = i.streamCountLimiter.AssertNewStreamAllowed(i.instanceID)
return i.onStreamCreationError(ctx, pushReqStream, err, labels, retentionHours)
} There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. good call, fixed it There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not quite my suggestion, but it's fine since it's a nit There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. are you sure? I'm doing it immediately after parsing the labels. |
||
} | ||
|
@@ -377,7 +383,7 @@ func (i *instance) createStreamByFP(ls labels.Labels, fp model.Fingerprint) (*st | |
return nil, fmt.Errorf("failed to create stream for fingerprint: %w", err) | ||
} | ||
|
||
retentionHours := i.limiter.limits.RetentionHours(i.instanceID, ls) | ||
retentionHours := util.RetentionHours(i.tenantsRetention.RetentionPeriodFor(i.instanceID, ls)) | ||
s := newStream(chunkfmt, headfmt, i.cfg, i.limiter.rateLimitStrategy, i.instanceID, fp, sortedLabels, i.limiter.UnorderedWrites(i.instanceID), i.streamRateCalculator, i.metrics, i.writeFailures, i.configs, retentionHours) | ||
|
||
i.onStreamCreated(s) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: instead of calling
util.RetentionHours
what about having tenantsRetention.RetentionHoursFor() string
that calls util.RetentionHours
on the result of tenantsRetention.RetentionPeriodFor
. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done