From 7a84661af72cec1b741e178b2c6885bb6a7d8950 Mon Sep 17 00:00:00 2001 From: Denny Pradipta Date: Fri, 31 Jan 2025 13:44:10 +0700 Subject: [PATCH] Feat: Improve Prometheus Metrics (#1338) * Feat: Improve Prometheus Metrics * Remove console.log * Fix test * Fix broken test * Feat: Updated Metrics * Add docs for default metrics * Clarify docs * Reverse the FAILED_ASSERTION_ALERT hacks --- docs/src/pages/guides/cli-options.md | 22 +- packages/notification/index.ts | 16 +- src/components/notification/index.ts | 10 +- src/components/probe/prober/http/index.ts | 14 +- src/components/probe/prober/index.ts | 6 + src/events/index.ts | 6 + src/loaders/index.ts | 4 + src/plugins/metrics/prometheus/collector.ts | 221 +++++++++++++------- src/plugins/metrics/prometheus/publisher.ts | 5 +- 9 files changed, 211 insertions(+), 93 deletions(-) diff --git a/docs/src/pages/guides/cli-options.md b/docs/src/pages/guides/cli-options.md index 7afb6ece1..fce083f3e 100644 --- a/docs/src/pages/guides/cli-options.md +++ b/docs/src/pages/guides/cli-options.md @@ -253,13 +253,21 @@ Then you can scrape the metrics from `http://localhost:3001/metrics`. Monika exposes [Prometheus default metrics](https://prometheus.io/docs/instrumenting/writing_clientlibs/#standard-and-runtime-collectors), [Node.js specific metrics](https://github.com/siimon/prom-client/tree/master/lib/metrics), and Monika probe metrics below. 
-| Metric Name | Type | Purpose | Label | -| -------------------------------------- | --------- | -------------------------------------------- | ------------------------------------------- | -| `monika_probes_total` | Gauge | Collect total probe | - | -| `monika_request_status_code_info` | Gauge | Collect HTTP status code | `id`, `name`, `url`, `method` | -| `monika_request_response_time_seconds` | Histogram | Collect duration of probe request in seconds | `id`, `name`, `url`, `method`, `statusCode` | -| `monika_request_response_size_bytes` | Gauge | Collect size of response size in bytes | `id`, `name`, `url`, `method`, `statusCode` | -| `monika_alert_total` | Counter | Collect total alert triggered | `id`, `name`, `url`, `method`, `alertQuery` | +| Metric Name | Type | Purpose | Labels | +| -------------------------------------- | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | +| `monika_alerts_triggered` | Counter | Indicates the count of incident alerts triggered | `id`, `name`, `url`, `method`, `alertQuery` | +| `monika_alerts_triggered_total` | Counter | Indicates the cumulative count of incident alerts triggered | - | +| `monika_probes_running` | Gauge | Indicates whether a probe is running (1) or idle (0). Running means the probe is currently sending requests, while idle means the probe is waiting for the next request to be sent. | `id` | +| `monika_probes_running_total` | Gauge | Indicates the total count of probes that are currently running. Running means the probe is currently sending requests. 
| - | +| `monika_probes_status` | Gauge | Indicates whether a probe is healthy (1) or is having an incident (0) | `id`, `name`, `url`, `method` | +| `monika_probes_total` | Gauge | Total count of all probes configured | - | +| `monika_request_response_size_bytes` | Gauge | Indicates the size of probe request's response in bytes | `id`, `name`, `url`, `method`, `statusCode`, `result` | +| `monika_request_response_time_seconds` | Histogram | Indicates the duration of the probe request in seconds | `id`, `name`, `url`, `method`, `statusCode`, `result` | +| `monika_request_status_code_info` | Gauge | Indicates the HTTP status code of the probe requests' response(s) | `id`, `name`, `url`, `method` | +| `monika_notifications_triggered` | Counter | Indicates the count of notifications triggered | `type`, `status` | +| `monika_notifications_triggered_total` | Counter | Indicates the cumulative count of notifications triggered | - | + +Aside from the above metrics, Monika also exposes [Prometheus default metrics](https://prometheus.io/docs/instrumenting/writing_clientlibs/#standard-and-runtime-collectors) and [Node.js specific metrics](https://github.com/siimon/prom-client/tree/master/lib/metrics) ## Repeat diff --git a/packages/notification/index.ts b/packages/notification/index.ts index 2876c0a9f..757551341 100644 --- a/packages/notification/index.ts +++ b/packages/notification/index.ts @@ -34,29 +34,33 @@ async function sendNotifications( notifications: Notification[], message: NotificationMessage, sender?: InputSender -): Promise { +): Promise<{ type: string; success: boolean }[]> { if (sender) { updateSender(sender) } - await Promise.all( + // Map notifications to an array of results + const results = await Promise.all( notifications.map(async ({ data, type }) => { const channel = channels[type] - try { if (!channel) { throw new Error('Notification channel is not available') } await channel.send(data, message) + return { type, success: true } } catch (error: unknown) { 
- const message = getErrorMessage(error) - throw new Error( - `Failed to send message using ${type}, please check your ${type} notification config.\nMessage: ${message}` + const errorMessage = getErrorMessage(error) + console.error( + `Failed to send message using ${type}, please check your ${type} notification config.\nMessage: ${errorMessage}` ) + return { type, success: false } } }) ) + + return results } export { sendNotifications } diff --git a/src/components/notification/index.ts b/src/components/notification/index.ts index 0a4564f24..d1ae26fe4 100644 --- a/src/components/notification/index.ts +++ b/src/components/notification/index.ts @@ -22,11 +22,13 @@ * SOFTWARE. * **********************************************************************************/ +import { getEventEmitter } from '../../utils/events' import { ValidatedResponse } from '../../plugins/validate-response' import getIp from '../../utils/ip' import { getMessageForAlert } from './alert-message' import { sendNotifications } from '@hyperjumptech/monika-notification' import type { Notification } from '@hyperjumptech/monika-notification' +import events from '../../events' type SendAlertsProps = { probeID: string @@ -54,5 +56,11 @@ export async function sendAlerts({ response: validation.response, }) - return sendNotifications(notifications, message) + const results = await sendNotifications(notifications, message) + for (const result of results) { + getEventEmitter().emit(events.notifications.sent, { + type: result.type, + status: result.success ? 
'success' : 'failed', + }) + } } diff --git a/src/components/probe/prober/http/index.ts b/src/components/probe/prober/http/index.ts index cf10de425..0155ade56 100644 --- a/src/components/probe/prober/http/index.ts +++ b/src/components/probe/prober/http/index.ts @@ -159,6 +159,12 @@ export class HTTPProber extends BaseProber { response, }) + getEventEmitter().emit(events.probe.status.changed, { + probe: this.probeConfig, + requestIndex, + status: 'up', + }) + this.logMessage( true, getProbeResultMessage({ @@ -226,10 +232,16 @@ export class HTTPProber extends BaseProber { } const alertId = getAlertID(url, validation, probeID) + getEventEmitter().emit(events.probe.status.changed, { + probe: this.probeConfig, + requestIndex, + status: 'down', + }) + getEventEmitter().emit(events.probe.alert.triggered, { probe: this.probeConfig, requestIndex, - alertQuery: '', + alertQuery: triggeredAlert, }) addIncident({ diff --git a/src/components/probe/prober/index.ts b/src/components/probe/prober/index.ts index b6731a175..1c24c1129 100644 --- a/src/components/probe/prober/index.ts +++ b/src/components/probe/prober/index.ts @@ -134,6 +134,7 @@ export abstract class BaseProber implements Prober { // this probe is definitely in incident state because of fail assertion, so send notification, etc. 
this.handleFailedProbe(probeResults) + return } @@ -148,6 +149,11 @@ export abstract class BaseProber implements Prober { requestIndex: index, response: requestResponse, }) + getEventEmitter().emit(events.probe.status.changed, { + probe: this.probeConfig, + requestIndex: index, + status: 'up', + }) logResponseTime(requestResponse.responseTime) if ( diff --git a/src/events/index.ts b/src/events/index.ts index 2bec870c5..f698e8049 100644 --- a/src/events/index.ts +++ b/src/events/index.ts @@ -34,6 +34,9 @@ export default { sanitized: 'CONFIG_SANITIZED', updated: 'CONFIG_UPDATED', }, + notifications: { + sent: 'NOTIFICATIONS_SENT', + }, probe: { alert: { triggered: 'PROBE_ALERT_TRIGGERED', @@ -46,5 +49,8 @@ export default { notification: { willSend: 'PROBE_NOTIFICATION_WILL_SEND', }, + status: { + changed: 'PROBE_STATUS_CHANGED', + }, }, } diff --git a/src/loaders/index.ts b/src/loaders/index.ts index 3f079b351..395b33974 100644 --- a/src/loaders/index.ts +++ b/src/loaders/index.ts @@ -82,6 +82,8 @@ function initPrometheus(prometheusPort: number) { decrementProbeRunningTotal, incrementProbeRunningTotal, resetProbeRunningTotal, + collectProbeStatus, + collectNotificationSentMetrics, } = new PrometheusCollector() // collect prometheus metrics @@ -93,6 +95,8 @@ function initPrometheus(prometheusPort: number) { eventEmitter.on(events.probe.ran, incrementProbeRunningTotal) eventEmitter.on(events.probe.finished, decrementProbeRunningTotal) eventEmitter.on(events.config.updated, resetProbeRunningTotal) + eventEmitter.on(events.probe.status.changed, collectProbeStatus) + eventEmitter.on(events.notifications.sent, collectNotificationSentMetrics) startPrometheusMetricsServer(prometheusPort) } diff --git a/src/plugins/metrics/prometheus/collector.ts b/src/plugins/metrics/prometheus/collector.ts index e4e47dc6f..e944a8c30 100644 --- a/src/plugins/metrics/prometheus/collector.ts +++ b/src/plugins/metrics/prometheus/collector.ts @@ -35,16 +35,20 @@ import type { 
ProbeRequestResponse } from '../../../interfaces/request' type PrometheusCustomCollector = { statusCode: Gauge<'id' | 'name' | 'url' | 'method'> - probeResult: Gauge<'id' | 'name' | 'url' | 'method'> - probeRunningTotal: Gauge<'id'> responseTime: Histogram< 'id' | 'name' | 'url' | 'method' | 'statusCode' | 'result' > responseSize: Gauge< 'id' | 'name' | 'url' | 'method' | 'statusCode' | 'result' > - alertTriggeredTotal: Counter<'id' | 'name' | 'url' | 'method' | 'alertQuery'> + alertsTriggered: Counter<'id' | 'name' | 'url' | 'method' | 'alertQuery'> + alertsTriggeredTotal: Counter + probesStatus: Gauge<'id' | 'name' | 'url' | 'method'> + probesRunningTotal: Gauge<'id'> + probesRunning: Gauge probesTotal: Gauge + notificationsTriggered: Counter<'type' | 'status'> + notificationsTriggeredTotal: Counter } type ProbeResult = { @@ -61,19 +65,36 @@ export class PrometheusCollector { register.clear() // register metric collector - const statusCode = new Gauge({ - name: 'monika_request_status_code_info', - help: 'HTTP status code', - labelNames: ['id', 'name', 'url', 'method'] as const, + const alertsTriggered = new Counter({ + name: 'monika_alerts_triggered', + help: 'Indicates the count of alerts triggered', + labelNames: ['id', 'name', 'url', 'method', 'alertQuery'] as const, + }) + const alertsTriggeredTotal = new Counter({ + name: 'monika_alerts_triggered_total', + help: 'Indicates the cumulative count of alerts triggered', + }) + const probesRunning = new Gauge({ + name: 'monika_probes_running', + help: 'Indicates whether a a probe is running (1) or idle (0)', + labelNames: ['id'] as const, + }) + const probesRunningTotal = new Gauge({ + name: 'monika_probes_running_total', + help: 'Indicates the total count of probes that are currently running checks', }) - const probeResult = new Gauge({ - name: 'monika_probe_result', - help: 'Probe result: -1=unknown, 0=failed, 1=success', + const probesStatus = new Gauge({ + name: 'monika_probes_status', + help: 'Indicates 
whether a probe is healthy (1) or is having an incident (0)', labelNames: ['id', 'name', 'url', 'method'] as const, }) - const responseTime = new Histogram({ - name: 'monika_request_response_time_seconds', - help: 'Duration of probe request in seconds', + const probesTotal = new Gauge({ + name: 'monika_probes_total', + help: 'Total count of all probes configured', + }) + const responseSize = new Gauge({ + name: 'monika_request_response_size_bytes', + help: "Indicates the size of probe request's response size in bytes", labelNames: [ 'id', 'name', @@ -83,9 +104,9 @@ export class PrometheusCollector { 'result', ] as const, }) - const responseSize = new Gauge({ - name: 'monika_request_response_size_bytes', - help: 'Size of response size in bytes', + const responseTime = new Histogram({ + name: 'monika_request_response_time_seconds', + help: 'Indicates the duration of the probe request in seconds', labelNames: [ 'id', 'name', @@ -95,67 +116,39 @@ export class PrometheusCollector { 'result', ] as const, }) - const alertTriggeredTotal = new Counter({ - name: 'monika_alert_total', - help: 'Total alert triggered', - labelNames: ['id', 'name', 'url', 'method', 'alertQuery'] as const, + const statusCode = new Gauge({ + name: 'monika_request_status_code_info', + help: "Indicates the HTTP status code of the probe requests' response(s)", + labelNames: ['id', 'name', 'url', 'method'] as const, }) - const probeRunningTotal = new Gauge({ - name: 'monika_probe_running_total', - help: 'Total of probe running', - labelNames: ['id'] as const, + const notificationsTriggered = new Counter({ + name: 'monika_notifications_triggered', + help: 'Indicates the count of notifications triggered', + labelNames: ['type', 'status'] as const, }) - const probesTotal = new Gauge({ - name: 'monika_probes_total', - help: 'Total of all probe', + const notificationsTriggeredTotal = new Counter({ + name: 'monika_notifications_triggered_total', + help: 'Indicates the cumulative count of notifications 
triggered', }) // register and collect default Node.js metrics collectDefaultMetrics({ register }) prometheusCustomCollector = { - statusCode, - probeResult, - responseTime, - responseSize, - alertTriggeredTotal, - probeRunningTotal, + alertsTriggered, + alertsTriggeredTotal, + probesRunningTotal, + probesRunning, + probesStatus, probesTotal, + responseSize, + responseTime, + statusCode, + notificationsTriggered, + notificationsTriggeredTotal, } } - collectProbeTotal(total: number): void { - if (!prometheusCustomCollector) { - throw new Error('Prometheus collector is not registered') - } - - prometheusCustomCollector.probesTotal.set(total) - } - - decrementProbeRunningTotal(id: string) { - if (!prometheusCustomCollector) { - throw new Error('Prometheus collector is not registered') - } - - prometheusCustomCollector.probeRunningTotal.labels(id).dec() - } - - incrementProbeRunningTotal(id: string) { - if (!prometheusCustomCollector) { - throw new Error('Prometheus collector is not registered') - } - - prometheusCustomCollector.probeRunningTotal.labels(id).inc() - } - - resetProbeRunningTotal() { - if (!prometheusCustomCollector) { - throw new Error('Prometheus collector is not registered') - } - - prometheusCustomCollector.probeRunningTotal.reset() - } - collectProbeRequestMetrics(probeResult: ProbeResult): void { if (!prometheusCustomCollector) { throw new Error('Prometheus collector is not registered') @@ -188,7 +181,6 @@ export class PrometheusCollector { } const { statusCode, - probeResult: probeResultCollector, responseTime: resposeTimeCollector, responseSize, } = prometheusCustomCollector @@ -202,18 +194,46 @@ export class PrometheusCollector { method: method ?? 'GET', }) .set(status) - probeResultCollector - ?.labels({ - id, - name, - url, - method: method ?? 
'GET', - }) - .set(result) resposeTimeCollector?.labels(labels).observe(responseTimeInSecond) responseSize?.labels(labels).set(responseSizeBytes || 0) } + collectProbeStatus( + probeResult: { status: 'up' | 'down' } & Omit + ): void { + if (!prometheusCustomCollector) { + throw new Error('Prometheus collector is not registered') + } + + const { probe, requestIndex, status } = probeResult + const { id, name, requests } = probe + + if (!requests || requests.length === 0) { + return + } + + const request = requests[requestIndex] + const { method, url } = request + const labels = { + id, + name, + url, + method: method ?? 'GET', + } + const { probesStatus } = prometheusCustomCollector + + // collect metrics + probesStatus?.labels(labels).set(status === 'up' ? 1 : 0) + } + + collectProbeTotal(total: number): void { + if (!prometheusCustomCollector) { + throw new Error('Prometheus collector is not registered') + } + + prometheusCustomCollector.probesTotal.set(total) + } + collectTriggeredAlert( probeResult: { alertQuery: string } & Omit ): void { @@ -235,11 +255,58 @@ export class PrometheusCollector { name, url, method: method ?? 
'GET', - alertQuery, + alertQuery: JSON.stringify(alertQuery), + } + const { alertsTriggered, alertsTriggeredTotal } = prometheusCustomCollector + + // collect metrics + alertsTriggered?.labels(labels).inc() + alertsTriggeredTotal?.inc() + } + + decrementProbeRunningTotal(id: string) { + if (!prometheusCustomCollector) { + throw new Error('Prometheus collector is not registered') } - const { alertTriggeredTotal } = prometheusCustomCollector + + prometheusCustomCollector.probesRunning.labels(id).dec() + prometheusCustomCollector.probesRunningTotal.dec() + } + + incrementProbeRunningTotal(id: string) { + if (!prometheusCustomCollector) { + throw new Error('Prometheus collector is not registered') + } + + prometheusCustomCollector.probesRunning.labels(id).inc() + prometheusCustomCollector.probesRunningTotal.inc() + } + + resetProbeRunningTotal() { + if (!prometheusCustomCollector) { + throw new Error('Prometheus collector is not registered') + } + + prometheusCustomCollector.probesRunning.reset() + prometheusCustomCollector.probesRunningTotal.reset() + } + + collectNotificationSentMetrics({ + type, + status, + }: { + type: string + status: 'success' | 'failed' + }) { + if (!prometheusCustomCollector) { + throw new Error('Prometheus collector is not registered') + } + + const { notificationsTriggered, notificationsTriggeredTotal } = + prometheusCustomCollector // collect metrics - alertTriggeredTotal?.labels(labels).inc() + notificationsTriggered.labels({ type, status }).inc() + notificationsTriggeredTotal.inc() } } diff --git a/src/plugins/metrics/prometheus/publisher.ts b/src/plugins/metrics/prometheus/publisher.ts index 59ab5bcd5..01ae5fe5b 100644 --- a/src/plugins/metrics/prometheus/publisher.ts +++ b/src/plugins/metrics/prometheus/publisher.ts @@ -45,7 +45,10 @@ export function startPrometheusMetricsServer(port: number): void { try { const prometheusMetrics = await register.metrics() - res.status(200).end(prometheusMetrics) + res + .status(200) + 
.header('Content-Type', register.contentType) + .end(prometheusMetrics) } catch (error: unknown) { res.status(500).json({ message: getErrorMessage(error) }) }