beacon/light: keep retrying checkpoint init if failed (#33966)
Some checks are pending
/ Linux Build (arm) (push) Waiting to run
/ Keeper Build (push) Waiting to run
/ Windows Build (push) Waiting to run
/ Linux Build (push) Waiting to run
/ Docker Image (push) Waiting to run

This PR changes the blsync checkpoint init logic so that even if the
initialization fails with a certain server and an error log message is
printed, the server goes back to its initial state and is allowed to
retry initialization after the failure delay period. The previous logic
had an `ssDone` server state that did put the server in a permanently
unusable state once the checkpoint init failed for an apparently
permanent reason. This was not the correct behavior because different
servers behave differently in case of overload and sometimes the
response to a permanently missing item is not clearly distinguishable
from an overload response. A safer logic is to never assume anything to
be permanent and always give a chance to retry.
The failure delay formula is also fixed; now it is properly capped at
`maxFailureDelay`. The previous formula did allow the delay to grow
unlimited if a retry was attempted immediately after each delay period.
This commit is contained in:
Felföldi Zsolt 2026-04-01 16:05:57 +02:00 committed by GitHub
parent 92b4cb2663
commit fc43170cdd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 3 additions and 6 deletions

View file

@ -438,14 +438,11 @@ func (s *serverWithLimits) fail(desc string) {
// failLocked calculates the dynamic failure delay and applies it.
func (s *serverWithLimits) failLocked(desc string) {
log.Debug("Server error", "description", desc)
s.failureDelay *= 2
now := s.clock.Now()
if now > s.failureDelayEnd {
s.failureDelay *= math.Pow(2, -float64(now-s.failureDelayEnd)/float64(maxFailureDelay))
}
if s.failureDelay < float64(minFailureDelay) {
s.failureDelay = float64(minFailureDelay)
}
s.failureDelay = max(min(s.failureDelay*2, float64(maxFailureDelay)), float64(minFailureDelay))
s.failureDelayEnd = now + mclock.AbsTime(s.failureDelay)
s.delay(time.Duration(s.failureDelay))
}

View file

@ -62,7 +62,6 @@ const (
ssNeedParent // cp header slot %32 != 0, need parent to check epoch boundary
ssParentRequested // cp parent header requested
ssPrintStatus // has all necessary info, print log message if init still not successful
ssDone // log message printed, no more action required
)
type serverState struct {
@ -180,7 +179,8 @@ func (s *CheckpointInit) Process(requester request.Requester, events []request.E
default:
log.Error("blsync: checkpoint not available, but reported as finalized; specified checkpoint hash might be too old", "server", server.Name())
}
s.serverState[server] = serverState{state: ssDone}
s.serverState[server] = serverState{state: ssDefault}
requester.Fail(server, "checkpoint init failed")
}
}