From fc43170cdd2a72bfb1e99e72f4e836a2e8036501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felf=C3=B6ldi=20Zsolt?= Date: Wed, 1 Apr 2026 16:05:57 +0200 Subject: [PATCH] beacon/light: keep retrying checkpoint init if failed (#33966) This PR changes the blsync checkpoint init logic so that even if the initialization fails with a particular server and an error log message is printed, the server goes back to its initial state and is allowed to retry initialization after the failure delay period. The previous logic had an `ssDone` server state that put the server in a permanently unusable state once the checkpoint init failed for an apparently permanent reason. This was not the correct behavior because different servers behave differently under overload, and sometimes the response to a permanently missing item is not clearly distinguishable from an overload response. A safer approach is to never assume that a failure is permanent and to always allow a retry. The failure delay formula is also fixed; now it is properly capped at `maxFailureDelay`. The previous formula allowed the delay to grow without bound if a retry was attempted immediately after each delay period. --- beacon/light/request/server.go | 5 +---- beacon/light/sync/update_sync.go | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/beacon/light/request/server.go b/beacon/light/request/server.go index a06dec99ae..d39570b8e5 100644 --- a/beacon/light/request/server.go +++ b/beacon/light/request/server.go @@ -438,14 +438,11 @@ func (s *serverWithLimits) fail(desc string) { // failLocked calculates the dynamic failure delay and applies it. 
func (s *serverWithLimits) failLocked(desc string) { log.Debug("Server error", "description", desc) - s.failureDelay *= 2 now := s.clock.Now() if now > s.failureDelayEnd { s.failureDelay *= math.Pow(2, -float64(now-s.failureDelayEnd)/float64(maxFailureDelay)) } - if s.failureDelay < float64(minFailureDelay) { - s.failureDelay = float64(minFailureDelay) - } + s.failureDelay = max(min(s.failureDelay*2, float64(maxFailureDelay)), float64(minFailureDelay)) s.failureDelayEnd = now + mclock.AbsTime(s.failureDelay) s.delay(time.Duration(s.failureDelay)) } diff --git a/beacon/light/sync/update_sync.go b/beacon/light/sync/update_sync.go index 9549ee5992..d84a3d64da 100644 --- a/beacon/light/sync/update_sync.go +++ b/beacon/light/sync/update_sync.go @@ -62,7 +62,6 @@ const ( ssNeedParent // cp header slot %32 != 0, need parent to check epoch boundary ssParentRequested // cp parent header requested ssPrintStatus // has all necessary info, print log message if init still not successful - ssDone // log message printed, no more action required ) type serverState struct { @@ -180,7 +179,8 @@ func (s *CheckpointInit) Process(requester request.Requester, events []request.E default: log.Error("blsync: checkpoint not available, but reported as finalized; specified checkpoint hash might be too old", "server", server.Name()) } - s.serverState[server] = serverState{state: ssDone} + s.serverState[server] = serverState{state: ssDefault} + requester.Fail(server, "checkpoint init failed") } }