diff --git a/README.md b/README.md index faadc645f..39b6ff714 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Try [a binary release](https://github.com/osrg/gobgp/releases/latest). - [EVPN](docs/sources/evpn.md) - [Flowspec](docs/sources/flowspec.md) - [RPKI](docs/sources/rpki.md) +- [Metrics](docs/sources/metrics.md) - [Managing GoBGP with your favorite language with gRPC](docs/sources/grpc-client.md) - Go Native BGP Library - [Basics](docs/sources/lib.md) diff --git a/docs/sources/metrics.md b/docs/sources/metrics.md new file mode 100644 index 000000000..72cf442f5 --- /dev/null +++ b/docs/sources/metrics.md @@ -0,0 +1,103 @@ +# Metrics + +This page explains how to use the Prometheus metrics generated by GoBGP. + +## Prerequisites + +Assume you finished [Getting Started](getting-started.md). + +## Contents + +- [Scraping the metrics](#scraping-the-metrics) +- [Exported metrics](#exported-metrics) +- [Label values](#label-values) +- [Grafana dashboard](#grafana-dashboard) + +## Scraping the metrics + +GoBGP exposes the metric endpoint and the pprof endpoint using the same startup flag: `--pprof-host`. By default, it listens locally on port `6060`. +The path on which the Prometheus metrics are exposed by default is `/metrics` and can be overridden using `--metrics-path`. Ensure `--pprof-disable` is not specified, or it will disable both `pprof` and +the metric endpoint, unless an override for the path is specified through `--metrics-path` (in that case, metrics will be exposed anyway). 
+ +Manually scrape the metrics to verify your setup is working: + +```bash +curl -s localhost:6060/metrics + +# HELP bgp_routes_received Number of routes received from peer +# TYPE bgp_routes_received gauge +bgp_routes_received{peer="100.109.96.78",route_family="l2vpn-evpn"} 3 +bgp_routes_received{peer="100.125.49.61",route_family="l2vpn-evpn"} 3 +bgp_routes_received{peer="100.67.112.61",route_family="l2vpn-evpn"} 3 +bgp_routes_received{peer="100.75.128.54",route_family="l2vpn-evpn"} 2 +# HELP bgp_sent_discarded_total Number of discarded BGP messages to peer +# TYPE bgp_sent_discarded_total counter +bgp_sent_discarded_total{peer="100.109.96.78"} 0 +bgp_sent_discarded_total{peer="100.125.49.61"} 0 +bgp_sent_discarded_total{peer="100.67.112.61"} 0 +bgp_sent_discarded_total{peer="100.75.128.54"} 0 +# HELP bgp_sent_keepalive_total Number of sent BGP KEEPALIVE messages from peer +# TYPE bgp_sent_keepalive_total counter +bgp_sent_keepalive_total{peer="100.109.96.78"} 536 +bgp_sent_keepalive_total{peer="100.125.49.61"} 536 +bgp_sent_keepalive_total{peer="100.67.112.61"} 536 +bgp_sent_keepalive_total{peer="100.75.128.54"} 536 +# HELP bgp_sent_message_total Number of sent BGP messages from peer +# TYPE bgp_sent_message_total counter +bgp_sent_message_total{peer="100.109.96.78"} 545 +bgp_sent_message_total{peer="100.125.49.61"} 545 +bgp_sent_message_total{peer="100.67.112.61"} 545 +bgp_sent_message_total{peer="100.75.128.54"} 546 +... +``` + +## Exported metrics + +The metrics are all prefixed with the `bgp` Prometheus namespace. 
+ +| **Metric** | **Description** | **Labels** | +| ---------------------------------- | ---------------------------------------------------------------------------- |----------------------------------------| +| bgp_peer_state | State of the BGP session with peer and its administrative state | `peer`, `session_state`, `admin_state` | +| bgp_peer_asn | What is the AS number of the peer and its router ID | `peer`, `router_id` | +| bgp_peer_local_asn | What is the AS number presented to the peer by this router and its ID | `peer`, `router_id` | +| bgp_peer_flop_count | Number of flops with the peer | `peer` | +| bgp_peer_out_queue_count | Length of the outgoing message queue | `peer` | +| bgp_peer_password_set | Whether the GoBGP peer has been configured (1) for authentication or not (0) | `peer` | +| bgp_peer_remove_private_as | Do we remove private ASNs from the paths sent to the peer | `peer` | +| bgp_peer_send_community | BGP community with the peer | `peer` | +| bgp_peer_type | Type of the BGP peer, internal (0) or external (1) | `peer` | +| bgp_peer_uptime | For how long the peer has been in its current state | `peer` | +| bgp_routes_accepted | Number of routes accepted from peer | `peer`, `route_family` | +| bgp_routes_advertised | Number of routes advertised to peer | `peer`, `route_family` | +| bgp_routes_received | Number of routes received from peer | `peer`, `route_family` | +| bgp_sent_discarded_total | Number of discarded BGP messages to peer | `peer` | +| bgp_sent_keepalive_total | Number of sent BGP KEEPALIVE messages from peer | `peer` | +| bgp_sent_message_total | Number of sent BGP messages from peer | `peer` | +| bgp_sent_notification_total | Number of sent BGP NOTIFICATION messages from peer | `peer` | +| bgp_sent_open_total | Number of sent BGP OPEN messages from peer | `peer` | +| bgp_sent_refresh_total | Number of sent BGP REFRESH messages from peer | `peer` | +| bgp_sent_update_total | Number of sent BGP UPDATE messages from peer | `peer` | +| 
bgp_sent_withdraw_prefix_total | Number of sent BGP WITHDRAW-PREFIX messages from peer | `peer` | +| bgp_sent_withdraw_update_total | Number of sent BGP WITHDRAW-UPDATE messages from peer | `peer` | +| bgp_received_discarded_total | Number of discarded BGP messages from peer | `peer` | +| bgp_received_keepalive_total | Number of received BGP KEEPALIVE messages from peer | `peer` | +| bgp_received_message_total | Number of received BGP messages from peer | `peer` | +| bgp_received_notification_total | Number of received BGP NOTIFICATION messages from peer | `peer` | +| bgp_received_open_total | Number of received BGP OPEN messages from peer | `peer` | +| bgp_received_refresh_total | Number of received BGP REFRESH messages from peer | `peer` | +| bgp_received_update_total | Number of received BGP UPDATE messages from peer | `peer` | +| bgp_received_withdraw_prefix_total | Number of received BGP WITHDRAW-PREFIX messages from peer | `peer` | +| bgp_received_withdraw_update_total | Number of received BGP WITHDRAW-UPDATE messages from peer | `peer` | + +## Label values + +Some labels can have specific values depending on the state of GoBGP or of the peers: + +- `peer`: the IP of the remote BGP peer +- `session_state`: the BGP FSM status of the peer, one of `UNKNOWN`, `IDLE`, `CONNECT`, `ACTIVE`, `OPENSENT`, `OPENCONFIRM` or `ESTABLISHED` +- `admin_state`: administrative state of the peer, one of `DOWN`, `UP` or `PFX_CNT` if prefix limit is reached +- `route_family`: any address family supported by GoBGP (e.g. `ipv4`, `ipv6`, `l2vpn-evpn`) + +## Grafana dashboard + +An example Grafana dashboard displaying some GoBGP metrics is available [here](https://grafana.com/grafana/dashboards/22061-gobgp/). diff --git a/internal/pkg/metrics/metrics.go b/internal/pkg/metrics/metrics.go index efcfbb07f..da302237b 100644 --- a/internal/pkg/metrics/metrics.go +++ b/internal/pkg/metrics/metrics.go @@ -14,45 +14,173 @@ type bgpCollector struct { server 
*server.BgpServer } +const ( + // Global namespace of the metrics + namespace = "bgp" +) + var ( - peerLabels = []string{"peer"} - peerStateLabels = []string{"peer", "session_state", "admin_state"} - rfLabels = []string{"peer", "route_family"} - - bgpReceivedUpdateTotalDesc = prometheus.NewDesc("bgp_received_update_total", "Number of received BGP UPDATE messages from peer", peerLabels, nil) - bgpReceivedNotificationTotalDesc = prometheus.NewDesc("bgp_received_notification_total", "Number of received BGP NOTIFICATION messages from peer", peerLabels, nil) - bgpReceivedOpenTotalDesc = prometheus.NewDesc("bgp_received_open_total", "Number of received BGP OPEN messages from peer", peerLabels, nil) - bgpReceivedRefreshTotalDesc = prometheus.NewDesc("bgp_received_refresh_total", "Number of received BGP REFRESH messages from peer", peerLabels, nil) - bgpReceivedKeepaliveTotalDesc = prometheus.NewDesc("bgp_received_keepalive_total", "Number of received BGP KEEPALIVE messages from peer", peerLabels, nil) - bgpReceivedWithdrawUpdateTotalDesc = prometheus.NewDesc("bgp_received_withdraw_update_total", "Number of received BGP WITHDRAW-UPDATE messages from peer", peerLabels, nil) - bgpReceivedWithdrawPrefixTotalDesc = prometheus.NewDesc("bgp_received_withdraw_prefix_total", "Number of received BGP WITHDRAW-PREFIX messages from peer", peerLabels, nil) - bgpReceivedDiscardedTotalDesc = prometheus.NewDesc("bgp_received_discarded_total", "Number of discarded BGP messages from peer", peerLabels, nil) - bgpReceivedMessageTotalDesc = prometheus.NewDesc("bgp_received_message_total", "Number of received BGP messages from peer", peerLabels, nil) - - bgpSentUpdateTotalDesc = prometheus.NewDesc("bgp_sent_update_total", "Number of sent BGP UPDATE messages from peer", peerLabels, nil) - bgpSentNotificationTotalDesc = prometheus.NewDesc("bgp_sent_notification_total", "Number of sent BGP NOTIFICATION messages from peer", peerLabels, nil) - bgpSentOpenTotalDesc = 
prometheus.NewDesc("bgp_sent_open_total", "Number of sent BGP OPEN messages from peer", peerLabels, nil) - bgpSentRefreshTotalDesc = prometheus.NewDesc("bgp_sent_refresh_total", "Number of sent BGP REFRESH messages from peer", peerLabels, nil) - bgpSentKeepaliveTotalDesc = prometheus.NewDesc("bgp_sent_keepalive_total", "Number of sent BGP KEEPALIVE messages from peer", peerLabels, nil) - bgpSentWithdrawUpdateTotalDesc = prometheus.NewDesc("bgp_sent_withdraw_update_total", "Number of sent BGP WITHDRAW-UPDATE messages from peer", peerLabels, nil) - bgpSentWithdrawPrefixTotalDesc = prometheus.NewDesc("bgp_sent_withdraw_prefix_total", "Number of sent BGP WITHDRAW-PREFIX messages from peer", peerLabels, nil) - bgpSentDiscardedTotalDesc = prometheus.NewDesc("bgp_sent_discarded_total", "Number of discarded BGP messages from peer", peerLabels, nil) - bgpSentMessageTotalDesc = prometheus.NewDesc("bgp_sent_message_total", "Number of sent BGP messages from peer", peerLabels, nil) - - bgpPeerStateDesc = prometheus.NewDesc("bgp_peer_state", "State of the BGP session with peer", peerStateLabels, nil) + // Labels appended to the metrics + peerLabels = []string{"peer"} + peerRouterIdLabels = []string{"peer", "router_id"} + peerStateLabels = []string{"peer", "session_state", "admin_state"} + rfLabels = []string{"peer", "route_family"} + + bgpReceivedUpdateTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "update_total"), + "Number of received BGP UPDATE messages from peer", + peerLabels, nil, + ) + bgpReceivedNotificationTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "notification_total"), + "Number of received BGP NOTIFICATION messages from peer", + peerLabels, nil, + ) + bgpReceivedOpenTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "open_total"), + "Number of received BGP OPEN messages from peer", + peerLabels, nil, + ) + bgpReceivedRefreshTotalDesc = prometheus.NewDesc( + 
prometheus.BuildFQName(namespace, "received", "refresh_total"), + "Number of received BGP REFRESH messages from peer", + peerLabels, nil, + ) + bgpReceivedKeepaliveTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "keepalive_total"), + "Number of received BGP KEEPALIVE messages from peer", + peerLabels, nil, + ) + bgpReceivedWithdrawUpdateTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "withdraw_update_total"), + "Number of received BGP WITHDRAW-UPDATE messages from peer", + peerLabels, nil, + ) + bgpReceivedWithdrawPrefixTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "withdraw_prefix_total"), + "Number of received BGP WITHDRAW-PREFIX messages from peer", + peerLabels, nil, + ) + bgpReceivedDiscardedTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "discarded_total"), + "Number of discarded BGP messages from peer", + peerLabels, nil, + ) + bgpReceivedMessageTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "received", "message_total"), + "Number of received BGP messages from peer", + peerLabels, nil, + ) + + bgpSentUpdateTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "update_total"), + "Number of sent BGP UPDATE messages from peer", + peerLabels, nil, + ) + bgpSentNotificationTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "notification_total"), + "Number of sent BGP NOTIFICATION messages from peer", + peerLabels, nil, + ) + bgpSentOpenTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "open_total"), + "Number of sent BGP OPEN messages from peer", + peerLabels, nil, + ) + bgpSentRefreshTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "refresh_total"), + "Number of sent BGP REFRESH messages from peer", + peerLabels, nil, + ) + bgpSentKeepaliveTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", 
"keepalive_total"), + "Number of sent BGP KEEPALIVE messages from peer", + peerLabels, nil, + ) + bgpSentWithdrawUpdateTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "withdraw_update_total"), + "Number of sent BGP WITHDRAW-UPDATE messages from peer", + peerLabels, nil, + ) + bgpSentWithdrawPrefixTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "withdraw_prefix_total"), + "Number of sent BGP WITHDRAW-PREFIX messages from peer", + peerLabels, nil, + ) + bgpSentDiscardedTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "discarded_total"), + "Number of discarded BGP messages to peer", peerLabels, + nil, + ) + bgpSentMessageTotalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sent", "message_total"), + "Number of sent BGP messages from peer", peerLabels, + nil, + ) + + bgpPeerOutQueueDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "out_queue_count"), + "Length of the outgoing message queue", + peerLabels, nil, + ) + bgpPeerFlopsDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "flop_count"), + "Number of flops with the peer", + peerLabels, nil, + ) + bgpPeerUptimeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "uptime"), + "For how long the peer has been in its current state", + peerLabels, nil, + ) + bgpPeerSendCommunityFlagDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "send_community"), + "BGP community with the peer", + peerLabels, nil, + ) + bgpPeerRemovePrivateAsFlagDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "remove_private_as"), + "Do we remove private ASNs from the paths sent to the peer", + peerLabels, nil, + ) + bgpPeerPasswordSetFlagDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "password_set"), + "Whether the GoBGP peer has been configured (1) for authentication or not (0)", + peerLabels, nil, + ) + bgpPeerTypeDesc = 
prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "type"), + "Type of the BGP peer, internal (0) or external (1)", + peerLabels, nil, + ) + bgpPeerAsnDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "asn"), + "What is the AS number of the peer", + peerRouterIdLabels, nil, + ) + bgpPeerLocalAsnDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "local_asn"), + "What is the AS number presented to the peer by this router", + peerRouterIdLabels, nil, + ) + bgpPeerStateDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "peer", "state"), + "State of the BGP session with peer and its administrative state", + peerStateLabels, nil, + ) bgpRoutesReceivedDesc = prometheus.NewDesc( - "bgp_routes_received", + prometheus.BuildFQName(namespace, "routes", "received"), "Number of routes received from peer", rfLabels, nil, ) bgpRoutesAcceptedDesc = prometheus.NewDesc( - "bgp_routes_accepted", + prometheus.BuildFQName(namespace, "routes", "accepted"), "Number of routes accepted from peer", rfLabels, nil, ) bgpRoutesAdvertisedDesc = prometheus.NewDesc( - "bgp_routes_advertised", + prometheus.BuildFQName(namespace, "routes", "advertised"), "Number of routes advertised to peer", rfLabels, nil, ) @@ -83,6 +211,15 @@ func (c *bgpCollector) Describe(out chan<- *prometheus.Desc) { out <- bgpSentDiscardedTotalDesc out <- bgpSentMessageTotalDesc + out <- bgpPeerOutQueueDesc + out <- bgpPeerFlopsDesc + out <- bgpPeerUptimeDesc + out <- bgpPeerSendCommunityFlagDesc + out <- bgpPeerRemovePrivateAsFlagDesc + out <- bgpPeerPasswordSetFlagDesc + out <- bgpPeerTypeDesc + out <- bgpPeerAsnDesc + out <- bgpPeerLocalAsnDesc out <- bgpPeerStateDesc out <- bgpRoutesReceivedDesc @@ -95,12 +232,14 @@ func (c *bgpCollector) Collect(out chan<- prometheus.Metric) { err := c.server.ListPeer(context.Background(), req, func(p *api.Peer) { peerState := p.GetState() peerAddr := peerState.GetNeighborAddress() + peerTimers := p.GetTimers() msg := 
peerState.GetMessages() send := func(desc *prometheus.Desc, cnt uint64) { out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, float64(cnt), peerAddr) } + // Statistics about BGP announcements we've received from our peers send(bgpReceivedUpdateTotalDesc, msg.Received.Update) send(bgpReceivedNotificationTotalDesc, msg.Received.Notification) send(bgpReceivedOpenTotalDesc, msg.Received.Open) @@ -111,16 +250,56 @@ func (c *bgpCollector) Collect(out chan<- prometheus.Metric) { send(bgpReceivedDiscardedTotalDesc, msg.Received.Discarded) send(bgpReceivedMessageTotalDesc, msg.Received.Total) + // Statistics about BGP announcements we've sent to our peers send(bgpSentUpdateTotalDesc, msg.Sent.Update) send(bgpSentNotificationTotalDesc, msg.Sent.Notification) send(bgpSentOpenTotalDesc, msg.Sent.Open) send(bgpSentRefreshTotalDesc, msg.Sent.Refresh) send(bgpSentKeepaliveTotalDesc, msg.Sent.Keepalive) - send(bgpSentWithdrawUpdateTotalDesc, uint64(msg.Sent.WithdrawUpdate)) - send(bgpSentWithdrawPrefixTotalDesc, uint64(msg.Sent.WithdrawPrefix)) + send(bgpSentWithdrawUpdateTotalDesc, msg.Sent.WithdrawUpdate) + send(bgpSentWithdrawPrefixTotalDesc, msg.Sent.WithdrawPrefix) send(bgpSentDiscardedTotalDesc, msg.Sent.Discarded) send(bgpSentMessageTotalDesc, msg.Sent.Total) + // The outbound queue message size + send(bgpPeerOutQueueDesc, uint64(peerState.GetOutQ())) + // The number of neighbor flops + send(bgpPeerFlopsDesc, uint64(peerState.GetFlops())) + // Uptime in seconds of the session + send(bgpPeerUptimeDesc, uint64(peerTimers.GetState().GetUptime().GetSeconds())) + // Whether BGP community is being sent + send(bgpPeerSendCommunityFlagDesc, uint64(peerState.GetSendCommunity())) + // Whether BGP Private AS is being removed (1) or not (0) + send(bgpPeerRemovePrivateAsFlagDesc, uint64(peerState.GetRemovePrivate())) + // Peer Type (0) for internal, (1) for external + send(bgpPeerTypeDesc, uint64(peerState.GetType())) + + // Whether authentication password is being set 
(1) or not (0) + passwordSetFlag := 0 + if peerState.GetAuthPassword() != "" { + passwordSetFlag = 1 + } + send(bgpPeerPasswordSetFlagDesc, uint64(passwordSetFlag)) + + // Remote peer router ID and ASN + out <- prometheus.MustNewConstMetric( + bgpPeerAsnDesc, + prometheus.GaugeValue, + float64(peerState.GetPeerAsn()), + peerAddr, + peerState.GetRouterId(), + ) + + // Local router ID and ASN advertised to peer + out <- prometheus.MustNewConstMetric( + bgpPeerLocalAsnDesc, + prometheus.GaugeValue, + float64(peerState.GetLocalAsn()), + peerAddr, + p.Transport.GetLocalAddress(), + ) + + // Session and administrative state of the peer out <- prometheus.MustNewConstMetric( bgpPeerStateDesc, prometheus.GaugeValue, diff --git a/pkg/server/server.go b/pkg/server/server.go index 41def6dab..bb82c3b28 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -899,7 +899,7 @@ func (s *BgpServer) toConfig(peer *peer, getAdvertised bool) *oc.Neighbor { if state == bgp.BGP_FSM_ESTABLISHED { peer.fsm.lock.RLock() conf.Transport.State.LocalAddress, conf.Transport.State.LocalPort = peer.fsm.LocalHostPort() - if conf.Transport.Config.LocalAddress != netip.IPv4Unspecified().String() { + if conf.Transport.Config.LocalAddress != netip.IPv4Unspecified().String() && conf.Transport.Config.LocalAddress != netip.IPv6Unspecified().String() { conf.Transport.State.LocalAddress = conf.Transport.Config.LocalAddress } _, conf.Transport.State.RemotePort = peer.fsm.RemoteHostPort() @@ -1639,7 +1639,7 @@ func (s *BgpServer) handleFSMMessage(peer *peer, e *fsmMsg) { // exclude zone info ipaddr, _ := net.ResolveIPAddr("ip", laddr) peer.fsm.peerInfo.LocalAddress = ipaddr.IP - if peer.fsm.pConf.Transport.Config.LocalAddress != netip.IPv4Unspecified().String() { + if peer.fsm.pConf.Transport.Config.LocalAddress != netip.IPv4Unspecified().String() && peer.fsm.pConf.Transport.Config.LocalAddress != netip.IPv6Unspecified().String() { peer.fsm.peerInfo.LocalAddress = 
net.ParseIP(peer.fsm.pConf.Transport.Config.LocalAddress) peer.fsm.pConf.Transport.State.LocalAddress = peer.fsm.pConf.Transport.Config.LocalAddress }