From 6928ec5d924604cc403a74ef3bbca29505443ab6 Mon Sep 17 00:00:00 2001 From: Csaba Kiraly Date: Tue, 15 Apr 2025 20:40:30 +0200 Subject: [PATCH] p2p: fix dial metrics not picking up the right error (#31621) Our metrics related to dial errors were off. The original error was not wrapped, so the caller function had no chance of picking it up. Therefore the most common error, which is "TooManyPeers", was not correctly counted. The metrics were originally introduced in https://github.com/ethereum/go-ethereum/pull/27621 I was thinking of various possible solutions. - the one proposed here wraps both the new error and the original error. It is not a pattern we use in other parts of the code, but works. This is maybe the smallest possible change. - as an alternate, I could write a proper `errProtoHandshakeError` with its own wrapped error - finally, I'm not even sure we need `errProtoHandshakeError`, maybe we could just pass up the original error. --------- Signed-off-by: Csaba Kiraly Co-authored-by: Felix Lange --- p2p/metrics.go | 41 ++++++++++++++++++++++++----------------- p2p/server.go | 12 ++++++++---- p2p/server_test.go | 4 ++-- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/p2p/metrics.go b/p2p/metrics.go index 8c9804206b..29c2acb0cb 100644 --- a/p2p/metrics.go +++ b/p2p/metrics.go @@ -49,7 +49,7 @@ var ( serveSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success", nil) dialMeter = metrics.NewRegisteredMeter("p2p/dials", nil) dialSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success", nil) - dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil) + dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil) // dial timeout; no route to host; connection refused; network is unreachable // count peers that stayed connected for at least 1 min serve1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success/1min", nil) @@ -61,34 +61,41 @@ var ( dialSelf = 
metrics.NewRegisteredMeter("p2p/dials/error/self", nil) dialUselessPeer = metrics.NewRegisteredMeter("p2p/dials/error/useless", nil) dialUnexpectedIdentity = metrics.NewRegisteredMeter("p2p/dials/error/id/unexpected", nil) - dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil) - dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil) + dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil) // EOF; connection reset during handshake; message too big; i/o timeout + dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil) // EOF + + // capture the rest of errors that are not handled by the above meters + dialOtherError = metrics.NewRegisteredMeter("p2p/dials/error/other", nil) ) -// markDialError matches errors that occur while setting up a dial connection -// to the corresponding meter. +// markDialError matches errors that occur while setting up a dial connection to the +// corresponding meter. We don't maintain meters for every possible error, just for +// the most interesting ones. 
func markDialError(err error) { if !metrics.Enabled() { return } - if err2 := errors.Unwrap(err); err2 != nil { - err = err2 - } - switch err { - case DiscTooManyPeers: + + var reason DiscReason + var handshakeErr *protoHandshakeError + d := errors.As(err, &reason) + switch { + case d && reason == DiscTooManyPeers: dialTooManyPeers.Mark(1) - case DiscAlreadyConnected: + case d && reason == DiscAlreadyConnected: dialAlreadyConnected.Mark(1) - case DiscSelf: + case d && reason == DiscSelf: dialSelf.Mark(1) - case DiscUselessPeer: + case d && reason == DiscUselessPeer: dialUselessPeer.Mark(1) - case DiscUnexpectedIdentity: + case d && reason == DiscUnexpectedIdentity: dialUnexpectedIdentity.Mark(1) - case errEncHandshakeError: - dialEncHandshakeError.Mark(1) - case errProtoHandshakeError: + case errors.As(err, &handshakeErr): dialProtoHandshakeError.Mark(1) + case errors.Is(err, errEncHandshakeError): + dialEncHandshakeError.Mark(1) + default: + dialOtherError.Mark(1) } } diff --git a/p2p/server.go b/p2p/server.go index 4e72e29fa0..d9105976dd 100644 --- a/p2p/server.go +++ b/p2p/server.go @@ -66,11 +66,15 @@ const ( ) var ( - errServerStopped = errors.New("server stopped") - errEncHandshakeError = errors.New("rlpx enc error") - errProtoHandshakeError = errors.New("rlpx proto error") + errServerStopped = errors.New("server stopped") + errEncHandshakeError = errors.New("rlpx enc error") ) +type protoHandshakeError struct{ err error } + +func (e *protoHandshakeError) Error() string { return fmt.Sprintf("rlpx proto error: %v", e.err) } +func (e *protoHandshakeError) Unwrap() error { return e.err } + // Server manages all peer connections. type Server struct { // Config fields may not be modified while the server is running. 
@@ -907,7 +911,7 @@ func (srv *Server) setupConn(c *conn, dialDest *enode.Node) error { phs, err := c.doProtoHandshake(srv.ourHandshake) if err != nil { clog.Trace("Failed p2p handshake", "err", err) - return fmt.Errorf("%w: %v", errProtoHandshakeError, err) + return &protoHandshakeError{err: err} } if id := c.node.ID(); !bytes.Equal(crypto.Keccak256(phs.ID), id[:]) { clog.Trace("Wrong devp2p handshake identity", "phsid", hex.EncodeToString(phs.ID)) diff --git a/p2p/server_test.go b/p2p/server_test.go index a0491e984a..d42926cf4c 100644 --- a/p2p/server_test.go +++ b/p2p/server_test.go @@ -410,11 +410,11 @@ func TestServerSetupConn(t *testing.T) { wantCloseErr: DiscUnexpectedIdentity, }, { - tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: errProtoHandshakeError}, + tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: DiscTooManyPeers}, dialDest: enode.NewV4(clientpub, nil, 0, 0), flags: dynDialedConn, wantCalls: "doEncHandshake,doProtoHandshake,close,", - wantCloseErr: errProtoHandshakeError, + wantCloseErr: DiscTooManyPeers, }, { tt: &setupTransport{pubkey: srvpub, phs: protoHandshake{ID: crypto.FromECDSAPub(srvpub)[1:]}},