From ecd5c18610c5276e7c6c34d2f317cf774441a81c Mon Sep 17 00:00:00 2001 From: Csaba Kiraly Date: Mon, 14 Apr 2025 10:13:45 +0200 Subject: [PATCH] p2p: better dial/serve success metrics (#31629) The previous success metrics counted a dial or serve as successful even if the peer disconnected right after connecting. The new 1min metrics only count peers that stayed connected for at least 1 min. The 1 min limit is an arbitrary choice; it is used only for statistics, not for any decision logic. --- p2p/metrics.go | 4 ++++ p2p/peer.go | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/p2p/metrics.go b/p2p/metrics.go index 1fd0f26db3..8c9804206b 100644 --- a/p2p/metrics.go +++ b/p2p/metrics.go @@ -51,6 +51,10 @@ var ( dialSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success", nil) dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil) + // count peers that stayed connected for at least 1 min + serve1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success/1min", nil) + dial1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success/1min", nil) + // handshake error meters dialTooManyPeers = metrics.NewRegisteredMeter("p2p/dials/error/saturated", nil) dialAlreadyConnected = metrics.NewRegisteredMeter("p2p/dials/error/known", nil) diff --git a/p2p/peer.go b/p2p/peer.go index a01df63d0c..9ffb94e5a8 100644 --- a/p2p/peer.go +++ b/p2p/peer.go @@ -254,6 +254,8 @@ func (p *Peer) run() (remoteRequested bool, err error) { p.wg.Add(2) go p.readLoop(readErr) go p.pingLoop() + live1min := time.NewTimer(1 * time.Minute) + defer live1min.Stop() // Start all protocol handlers. writeStart <- struct{}{} @@ -285,6 +287,12 @@ loop: case err = <-p.disc: reason = discReasonForError(err) break loop + case <-live1min.C: + if p.Inbound() { + serve1MinSuccessMeter.Mark(1) + } else { + dial1MinSuccessMeter.Mark(1) + } } }