From 2fd817bed8de8af85bf1bc2e0453a5cacdc7f44f Mon Sep 17 00:00:00 2001 From: ulya-sidorina Date: Wed, 22 Jan 2025 22:19:17 +0100 Subject: [PATCH] feat(metrics): add ElapsedTimeSinceLastBackup and RPODuration gauges for schedules --- internal/metrics/metrics.go | 46 +++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 97b65684..f4cff0fe 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -75,10 +75,12 @@ type MetricsRegistryImpl struct { backupsSucceededCount *prometheus.GaugeVec // schedule metrics - scheduleActionFailedCount *prometheus.CounterVec - scheduleActionSucceededCount *prometheus.CounterVec - scheduleLastBackupTimestamp *prometheus.GaugeVec - scheduleRPOMarginRatio *prometheus.GaugeVec + scheduleActionFailedCount *prometheus.CounterVec + scheduleActionSucceededCount *prometheus.CounterVec + scheduleLastBackupTimestamp *prometheus.GaugeVec + scheduleRPOMarginRatio *prometheus.GaugeVec + scheduleElapsedTimeSinceLastBackup *prometheus.GaugeVec + scheduleRPODuration *prometheus.GaugeVec } func (s *MetricsRegistryImpl) ReportHealthCheck() { @@ -241,6 +243,13 @@ func (s *MetricsRegistryImpl) IncScheduleCounters(schedule *types.BackupSchedule schedule.ID, scheduleNameLabel, ).Set(float64(schedule.RecoveryPoint.Unix())) + + s.scheduleElapsedTimeSinceLastBackup.WithLabelValues( + schedule.ContainerID, + schedule.DatabaseName, + schedule.ID, + scheduleNameLabel, + ).Set(s.clock.Since(*schedule.RecoveryPoint).Seconds()) } else if schedule.Audit != nil && schedule.Audit.CreatedAt != nil { // Report schedule creation time as last backup time if no backups were made s.scheduleLastBackupTimestamp.WithLabelValues( @@ -249,7 +258,24 @@ func (s *MetricsRegistryImpl) IncScheduleCounters(schedule *types.BackupSchedule schedule.ID, scheduleNameLabel, ).Set(float64(schedule.Audit.CreatedAt.AsTime().Unix())) + + s.scheduleElapsedTimeSinceLastBackup.WithLabelValues( + schedule.ContainerID, + schedule.DatabaseName, + schedule.ID, + scheduleNameLabel, + ).Set(s.clock.Since(schedule.Audit.CreatedAt.AsTime()).Seconds()) } + + if schedule.ScheduleSettings.RecoveryPointObjective != nil { + s.scheduleRPODuration.WithLabelValues( + schedule.ContainerID, + schedule.DatabaseName, + schedule.ID, + scheduleNameLabel, + ).Set(float64(schedule.ScheduleSettings.RecoveryPointObjective.Seconds)) + } + info := schedule.GetBackupInfo(s.clock) if info != nil { s.scheduleRPOMarginRatio.WithLabelValues( @@ -392,6 +418,18 @@ func newMetricsRegistry(ctx context.Context, wg *sync.WaitGroup, cfg *config.Met Help: "if RPO is set for schedule, calculates a ratio to which RPO is satisfied", }, []string{"container_id", "database", "schedule_id", "schedule_name"}) + s.scheduleElapsedTimeSinceLastBackup = promauto.With(s.reg).NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: "schedules", + Name: "elapsed_seconds_since_last_backup", + Help: "Amount of time elapsed since last successful backup for this schedule", + }, []string{"container_id", "database", "schedule_id", "schedule_name"}) + + s.scheduleRPODuration = promauto.With(s.reg).NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: "schedules", + Name: "rpo_duration_seconds", + Help: "Maximum length of time permitted, that backup can be restored for this schedule", + }, []string{"container_id", "database", "schedule_id", "schedule_name"}) + mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(s.reg, promhttp.HandlerOpts{Registry: s.reg}))