go-ethereum/beacon/light/sync/update_sync.go
Felföldi Zsolt fc43170cdd
Some checks are pending
/ Linux Build (arm) (push) Waiting to run
/ Keeper Build (push) Waiting to run
/ Windows Build (push) Waiting to run
/ Linux Build (push) Waiting to run
/ Docker Image (push) Waiting to run
beacon/light: keep retrying checkpoint init if failed (#33966)
This PR changes the blsync checkpoint init logic so that even if the
initialization fails with a certain server and an error log message is
printed, the server goes back to its initial state and is allowed to
retry initialization after the failure delay period. The previous logic
had an `ssDone` server state that did put the server in a permanently
unusable state once the checkpoint init failed for an apparently
permanent reason. This was not the correct behavior because different
servers behave differently in case of overload and sometimes the
response to a permanently missing item is not clearly distinguishable
from an overload response. A safer logic is to never assume anything to
be permanent and always give a chance to retry.
The failure delay formula is also fixed; now it is properly capped at
`maxFailureDelay`. The previous formula did allow the delay to grow
unlimited if a retry was attempted immediately after each delay period.
2026-04-01 16:05:57 +02:00

398 lines
13 KiB
Go

// Copyright 2023 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package sync
import (
"sort"
"github.com/ethereum/go-ethereum/beacon/light"
"github.com/ethereum/go-ethereum/beacon/light/request"
"github.com/ethereum/go-ethereum/beacon/params"
"github.com/ethereum/go-ethereum/beacon/types"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/log"
)
const maxUpdateRequest = 8 // maximum number of updates requested in a single request
type committeeChain interface {
CheckpointInit(bootstrap types.BootstrapData) error
InsertUpdate(update *types.LightClientUpdate, nextCommittee *types.SerializedSyncCommittee) error
NextSyncPeriod() (uint64, bool)
}
// CheckpointInit implements request.Module; it fetches the light client bootstrap
// data belonging to the given checkpoint hash and initializes the committee chain
// if successful.
type CheckpointInit struct {
chain committeeChain
checkpointHash common.Hash
locked request.ServerAndID
initialized bool
// per-server state is used to track the state of requesting checkpoint header
// info. Part of this info (canonical and finalized state) is not validated
// and therefore it is requested from each server separately after it has
// reported a missing checkpoint (which is also not validated info).
serverState map[request.Server]serverState
// the following fields are used to determine whether the checkpoint is on
// epoch boundary. This information is validated and therefore stored globally.
parentHash common.Hash
hasEpochInfo, epochBoundary bool
cpSlot, parentSlot uint64
}
const (
ssDefault = iota // no action yet or checkpoint requested
ssNeedHeader // checkpoint req failed, need cp header
ssHeaderRequested // cp header requested
ssNeedParent // cp header slot %32 != 0, need parent to check epoch boundary
ssParentRequested // cp parent header requested
ssPrintStatus // has all necessary info, print log message if init still not successful
)
type serverState struct {
state int
hasHeader, canonical, finalized bool // stored per server because not validated
}
// NewCheckpointInit creates a new CheckpointInit.
func NewCheckpointInit(chain committeeChain, checkpointHash common.Hash) *CheckpointInit {
return &CheckpointInit{
chain: chain,
checkpointHash: checkpointHash,
serverState: make(map[request.Server]serverState),
}
}
// Process implements request.Module.
func (s *CheckpointInit) Process(requester request.Requester, events []request.Event) {
if s.initialized {
return
}
for _, event := range events {
switch event.Type {
case request.EvResponse, request.EvFail, request.EvTimeout:
sid, req, resp := event.RequestInfo()
if s.locked == sid {
s.locked = request.ServerAndID{}
}
if event.Type == request.EvTimeout {
continue
}
switch s.serverState[sid.Server].state {
case ssDefault:
if resp != nil {
if checkpoint := resp.(*types.BootstrapData); checkpoint.Header.Hash() == common.Hash(req.(ReqCheckpointData)) {
s.chain.CheckpointInit(*checkpoint)
s.initialized = true
return
}
requester.Fail(event.Server, "invalid checkpoint data")
}
s.serverState[sid.Server] = serverState{state: ssNeedHeader}
case ssHeaderRequested:
if resp == nil {
s.serverState[sid.Server] = serverState{state: ssPrintStatus}
continue
}
newState := serverState{
hasHeader: true,
canonical: resp.(RespHeader).Canonical,
finalized: resp.(RespHeader).Finalized,
}
s.cpSlot, s.parentHash = resp.(RespHeader).Header.Slot, resp.(RespHeader).Header.ParentRoot
if s.cpSlot%params.EpochLength == 0 {
s.hasEpochInfo, s.epochBoundary = true, true
}
if s.hasEpochInfo {
newState.state = ssPrintStatus
} else {
newState.state = ssNeedParent
}
s.serverState[sid.Server] = newState
case ssParentRequested:
s.parentSlot = resp.(RespHeader).Header.Slot
s.hasEpochInfo, s.epochBoundary = true, s.cpSlot/params.EpochLength > s.parentSlot/params.EpochLength
newState := s.serverState[sid.Server]
newState.state = ssPrintStatus
s.serverState[sid.Server] = newState
}
case request.EvUnregistered:
delete(s.serverState, event.Server)
}
}
// start a request if possible
for _, server := range requester.CanSendTo() {
switch s.serverState[server].state {
case ssDefault:
if s.locked == (request.ServerAndID{}) {
id := requester.Send(server, ReqCheckpointData(s.checkpointHash))
s.locked = request.ServerAndID{Server: server, ID: id}
}
case ssNeedHeader:
requester.Send(server, ReqHeader(s.checkpointHash))
newState := s.serverState[server]
newState.state = ssHeaderRequested
s.serverState[server] = newState
case ssNeedParent:
requester.Send(server, ReqHeader(s.parentHash))
newState := s.serverState[server]
newState.state = ssParentRequested
s.serverState[server] = newState
}
}
// print log message if necessary
for server, state := range s.serverState {
if state.state != ssPrintStatus {
continue
}
switch {
case !state.hasHeader:
log.Error("blsync: checkpoint block is not available, reported as unknown", "server", server.Name())
case !state.canonical:
log.Error("blsync: checkpoint block is not available, reported as non-canonical", "server", server.Name())
case !s.hasEpochInfo:
// should be available if hasHeader is true and state is ssPrintStatus
panic("checkpoint epoch info not available when printing retrieval status")
case !s.epochBoundary:
log.Error("blsync: checkpoint block is not first of epoch", "slot", s.cpSlot, "parent", s.parentSlot, "server", server.Name())
case !state.finalized:
log.Error("blsync: checkpoint block is reported as non-finalized", "server", server.Name())
default:
log.Error("blsync: checkpoint not available, but reported as finalized; specified checkpoint hash might be too old", "server", server.Name())
}
s.serverState[server] = serverState{state: ssDefault}
requester.Fail(server, "checkpoint init failed")
}
}
// ForwardUpdateSync implements request.Module; it fetches updates between the
// committee chain head and each server's announced head. Updates are fetched
// in batches and multiple batches can also be requested in parallel.
// Out of order responses are also handled; if a batch of updates cannot be added
// to the chain immediately because of a gap then the future updates are
// remembered until they can be processed.
type ForwardUpdateSync struct {
chain committeeChain
rangeLock rangeLock
lockedIDs map[request.ServerAndID]struct{}
processQueue []updateResponse
nextSyncPeriod map[request.Server]uint64
}
// NewForwardUpdateSync creates a new ForwardUpdateSync.
func NewForwardUpdateSync(chain committeeChain) *ForwardUpdateSync {
return &ForwardUpdateSync{
chain: chain,
rangeLock: make(rangeLock),
lockedIDs: make(map[request.ServerAndID]struct{}),
nextSyncPeriod: make(map[request.Server]uint64),
}
}
// rangeLock allows locking sections of an integer space, preventing the syncing
// mechanism from making requests again for sections where a not timed out request
// is already pending or where already fetched and unprocessed data is available.
type rangeLock map[uint64]int
// lock locks or unlocks the given section, depending on the sign of the add parameter.
func (r rangeLock) lock(first, count uint64, add int) {
for i := first; i < first+count; i++ {
if v := r[i] + add; v > 0 {
r[i] = v
} else {
delete(r, i)
}
}
}
// firstUnlocked returns the first unlocked section starting at or after start
// and not longer than maxCount.
func (r rangeLock) firstUnlocked(start, maxCount uint64) (first, count uint64) {
first = start
for {
if _, ok := r[first]; !ok {
break
}
first++
}
for {
count++
if count == maxCount {
break
}
if _, ok := r[first+count]; ok {
break
}
}
return
}
// lockRange locks the range belonging to the given update request, unless the
// same request has already been locked
func (s *ForwardUpdateSync) lockRange(sid request.ServerAndID, req ReqUpdates) {
if _, ok := s.lockedIDs[sid]; ok {
return
}
s.lockedIDs[sid] = struct{}{}
s.rangeLock.lock(req.FirstPeriod, req.Count, 1)
}
// unlockRange unlocks the range belonging to the given update request, unless
// same request has already been unlocked
func (s *ForwardUpdateSync) unlockRange(sid request.ServerAndID, req ReqUpdates) {
if _, ok := s.lockedIDs[sid]; !ok {
return
}
delete(s.lockedIDs, sid)
s.rangeLock.lock(req.FirstPeriod, req.Count, -1)
}
// verifyRange returns true if the number of updates and the individual update
// periods in the response match the requested section.
func (s *ForwardUpdateSync) verifyRange(request ReqUpdates, response RespUpdates) bool {
if uint64(len(response.Updates)) != request.Count || uint64(len(response.Committees)) != request.Count {
return false
}
for i, update := range response.Updates {
if update.AttestedHeader.Header.SyncPeriod() != request.FirstPeriod+uint64(i) {
return false
}
}
return true
}
// updateResponse is a response that has passed initial verification and has been
// queued for processing. Note that an update response cannot be processed until
// the previous updates have also been added to the chain.
type updateResponse struct {
sid request.ServerAndID
request ReqUpdates
response RespUpdates
}
// updateResponseList implements sort.Sort and sorts update request/response events by FirstPeriod.
type updateResponseList []updateResponse
func (u updateResponseList) Len() int { return len(u) }
func (u updateResponseList) Swap(i, j int) { u[i], u[j] = u[j], u[i] }
func (u updateResponseList) Less(i, j int) bool {
return u[i].request.FirstPeriod < u[j].request.FirstPeriod
}
// Process implements request.Module.
func (s *ForwardUpdateSync) Process(requester request.Requester, events []request.Event) {
for _, event := range events {
switch event.Type {
case request.EvResponse, request.EvFail, request.EvTimeout:
sid, rq, rs := event.RequestInfo()
req := rq.(ReqUpdates)
var queued bool
if event.Type == request.EvResponse {
resp := rs.(RespUpdates)
if s.verifyRange(req, resp) {
// there is a response with a valid format; put it in the process queue
s.processQueue = append(s.processQueue, updateResponse{sid: sid, request: req, response: resp})
s.lockRange(sid, req)
queued = true
} else {
requester.Fail(event.Server, "invalid update range")
}
}
if !queued {
s.unlockRange(sid, req)
}
case EvNewOptimisticUpdate:
update := event.Data.(types.OptimisticUpdate)
s.nextSyncPeriod[event.Server] = types.SyncPeriod(update.SignatureSlot + 256)
case request.EvUnregistered:
delete(s.nextSyncPeriod, event.Server)
}
}
// try processing ordered list of available responses
sort.Sort(updateResponseList(s.processQueue))
for s.processQueue != nil {
u := s.processQueue[0]
if !s.processResponse(requester, u) {
break
}
s.unlockRange(u.sid, u.request)
s.processQueue = s.processQueue[1:]
if len(s.processQueue) == 0 {
s.processQueue = nil
}
}
// start new requests if possible
startPeriod, chainInit := s.chain.NextSyncPeriod()
if !chainInit {
return
}
for {
firstPeriod, maxCount := s.rangeLock.firstUnlocked(startPeriod, maxUpdateRequest)
var (
sendTo request.Server
bestCount uint64
)
for _, server := range requester.CanSendTo() {
nextPeriod := s.nextSyncPeriod[server]
if nextPeriod <= firstPeriod {
continue
}
count := maxCount
if nextPeriod < firstPeriod+maxCount {
count = nextPeriod - firstPeriod
}
if count > bestCount {
sendTo, bestCount = server, count
}
}
if sendTo == nil {
return
}
req := ReqUpdates{FirstPeriod: firstPeriod, Count: bestCount}
id := requester.Send(sendTo, req)
s.lockRange(request.ServerAndID{Server: sendTo, ID: id}, req)
}
}
// processResponse adds the fetched updates and committees to the committee chain.
// Returns true in case of full or partial success.
func (s *ForwardUpdateSync) processResponse(requester request.Requester, u updateResponse) (success bool) {
for i, update := range u.response.Updates {
if err := s.chain.InsertUpdate(update, u.response.Committees[i]); err != nil {
if err == light.ErrInvalidPeriod {
// there is a gap in the update periods; stop processing without
// failing and try again next time
return
}
if err == light.ErrInvalidUpdate || err == light.ErrWrongCommitteeRoot || err == light.ErrCannotReorg {
requester.Fail(u.sid.Server, "invalid update received")
} else {
log.Error("Unexpected InsertUpdate error", "error", err)
}
return
}
success = true
}
return
}