1
0
Fork 0
forked from forks/go-ethereum

p2p: fix dial metrics not picking up the right error (#31621)

Our metrics related to dial errors were off. The original error was not
wrapped, so the caller function had no chance of picking it up.
Therefore the most common error, which is "TooManyPeers", was not
correctly counted.

The metrics were originally introduced in
https://github.com/ethereum/go-ethereum/pull/27621

I was thinking of various possible solutions.
- the one proposed here wraps both the new error and the original error.
It is not a pattern we use in other parts of the code, but it works. This
is maybe the smallest possible change.
- as an alternative, I could write a proper `errProtoHandshakeError` with
its own wrapped error
- finally, I'm not even sure we need `errProtoHandshakeError`, maybe we
could just pass up the original error.

---------

Signed-off-by: Csaba Kiraly <csaba.kiraly@gmail.com>
Co-authored-by: Felix Lange <fjl@twurst.com>
This commit is contained in:
Csaba Kiraly 2025-04-15 20:40:30 +02:00 committed by GitHub
parent 476f117211
commit 6928ec5d92
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 34 additions and 23 deletions

View file

@ -49,7 +49,7 @@ var (
serveSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success", nil)
dialMeter = metrics.NewRegisteredMeter("p2p/dials", nil)
dialSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success", nil)
dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil)
dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil) // dial timeout; no route to host; connection refused; network is unreachable
// count peers that stayed connected for at least 1 min
serve1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success/1min", nil)
@ -61,34 +61,41 @@ var (
dialSelf = metrics.NewRegisteredMeter("p2p/dials/error/self", nil)
dialUselessPeer = metrics.NewRegisteredMeter("p2p/dials/error/useless", nil)
dialUnexpectedIdentity = metrics.NewRegisteredMeter("p2p/dials/error/id/unexpected", nil)
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil)
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil)
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil) // EOF; connection reset during handshake; message too big; i/o timeout
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil) // EOF
// capture the rest of errors that are not handled by the above meters
dialOtherError = metrics.NewRegisteredMeter("p2p/dials/error/other", nil)
)
// markDialError matches errors that occur while setting up a dial connection
// to the corresponding meter.
// markDialError matches errors that occur while setting up a dial connection to the
// corresponding meter. We don't maintain meters for every possible error, just for
// the most interesting ones.
func markDialError(err error) {
if !metrics.Enabled() {
return
}
if err2 := errors.Unwrap(err); err2 != nil {
err = err2
}
switch err {
case DiscTooManyPeers:
var reason DiscReason
var handshakeErr *protoHandshakeError
d := errors.As(err, &reason)
switch {
case d && reason == DiscTooManyPeers:
dialTooManyPeers.Mark(1)
case DiscAlreadyConnected:
case d && reason == DiscAlreadyConnected:
dialAlreadyConnected.Mark(1)
case DiscSelf:
case d && reason == DiscSelf:
dialSelf.Mark(1)
case DiscUselessPeer:
case d && reason == DiscUselessPeer:
dialUselessPeer.Mark(1)
case DiscUnexpectedIdentity:
case d && reason == DiscUnexpectedIdentity:
dialUnexpectedIdentity.Mark(1)
case errEncHandshakeError:
dialEncHandshakeError.Mark(1)
case errProtoHandshakeError:
case errors.As(err, &handshakeErr):
dialProtoHandshakeError.Mark(1)
case errors.Is(err, errEncHandshakeError):
dialEncHandshakeError.Mark(1)
default:
dialOtherError.Mark(1)
}
}

View file

@ -66,11 +66,15 @@ const (
)
var (
errServerStopped = errors.New("server stopped")
errEncHandshakeError = errors.New("rlpx enc error")
errProtoHandshakeError = errors.New("rlpx proto error")
errServerStopped = errors.New("server stopped")
errEncHandshakeError = errors.New("rlpx enc error")
)
type protoHandshakeError struct{ err error }
func (e *protoHandshakeError) Error() string { return fmt.Sprintf("rlpx proto error: %v", e.err) }
func (e *protoHandshakeError) Unwrap() error { return e.err }
// Server manages all peer connections.
type Server struct {
// Config fields may not be modified while the server is running.
@ -907,7 +911,7 @@ func (srv *Server) setupConn(c *conn, dialDest *enode.Node) error {
phs, err := c.doProtoHandshake(srv.ourHandshake)
if err != nil {
clog.Trace("Failed p2p handshake", "err", err)
return fmt.Errorf("%w: %v", errProtoHandshakeError, err)
return &protoHandshakeError{err: err}
}
if id := c.node.ID(); !bytes.Equal(crypto.Keccak256(phs.ID), id[:]) {
clog.Trace("Wrong devp2p handshake identity", "phsid", hex.EncodeToString(phs.ID))

View file

@ -410,11 +410,11 @@ func TestServerSetupConn(t *testing.T) {
wantCloseErr: DiscUnexpectedIdentity,
},
{
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: errProtoHandshakeError},
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: DiscTooManyPeers},
dialDest: enode.NewV4(clientpub, nil, 0, 0),
flags: dynDialedConn,
wantCalls: "doEncHandshake,doProtoHandshake,close,",
wantCloseErr: errProtoHandshakeError,
wantCloseErr: DiscTooManyPeers,
},
{
tt: &setupTransport{pubkey: srvpub, phs: protoHandshake{ID: crypto.FromECDSAPub(srvpub)[1:]}},