eth/protocols/snap: restore peers to idle pool on request revert (#33790)
Some checks are pending
/ Linux Build (push) Waiting to run
/ Linux Build (arm) (push) Waiting to run
/ Keeper Build (push) Waiting to run
/ Windows Build (push) Waiting to run
/ Docker Image (push) Waiting to run

All five `revert*Request` functions (account, bytecode, storage,
trienode heal, bytecode heal) remove the request from the tracked set
but never restore the peer to its corresponding idle pool. When a
request times out and no response arrives, the peer is permanently lost
from the idle pool, preventing new work from being assigned to it.

In normal operation mode (snap-sync full state) this bug is masked by
pivot movement (which resets idle pools via new Sync() cycles every ~15
minutes) and peer churn (reconnections re-add peers via Register()).
However, in scenarios like the one I have running my [partial-stateful
node](https://github.com/ethereum/go-ethereum/pull/33764) with
long-running sync cycles and few peers, all peers can eventually leak
out of the idle pools, stalling sync entirely.

Fix: after deleting from the request map, restore the peer to its idle
pool if it is still registered (guards against the peer-drop path where
Unregister already removed the peer). This mirrors the pattern used in
all five On* response handlers.


This only seems to manifest in peer-starved scenarios, such as the one I
find myself in when testing snap sync for the partial-stateful node.
Still, I thought it was worth raising this point; I am unsure whether it
requires further discussion.
This commit is contained in:
CPerezz 2026-02-24 02:14:11 +01:00 committed by GitHub
parent 82fad31540
commit c2e1785a48
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1699,9 +1699,13 @@ func (s *Syncer) revertAccountRequest(req *accountRequest) {
}
close(req.stale)
// Remove the request from the tracked set
// Remove the request from the tracked set and restore the peer to the
// idle pool so it can be reassigned work (skip if peer already left).
s.lock.Lock()
delete(s.accountReqs, req.id)
if _, ok := s.peers[req.peer]; ok {
s.accountIdlers[req.peer] = struct{}{}
}
s.lock.Unlock()
// If there's a timeout timer still running, abort it and mark the account
@ -1740,9 +1744,13 @@ func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
}
close(req.stale)
// Remove the request from the tracked set
// Remove the request from the tracked set and restore the peer to the
// idle pool so it can be reassigned work (skip if peer already left).
s.lock.Lock()
delete(s.bytecodeReqs, req.id)
if _, ok := s.peers[req.peer]; ok {
s.bytecodeIdlers[req.peer] = struct{}{}
}
s.lock.Unlock()
// If there's a timeout timer still running, abort it and mark the code
@ -1781,9 +1789,13 @@ func (s *Syncer) revertStorageRequest(req *storageRequest) {
}
close(req.stale)
// Remove the request from the tracked set
// Remove the request from the tracked set and restore the peer to the
// idle pool so it can be reassigned work (skip if peer already left).
s.lock.Lock()
delete(s.storageReqs, req.id)
if _, ok := s.peers[req.peer]; ok {
s.storageIdlers[req.peer] = struct{}{}
}
s.lock.Unlock()
// If there's a timeout timer still running, abort it and mark the storage
@ -1826,9 +1838,13 @@ func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
}
close(req.stale)
// Remove the request from the tracked set
// Remove the request from the tracked set and restore the peer to the
// idle pool so it can be reassigned work (skip if peer already left).
s.lock.Lock()
delete(s.trienodeHealReqs, req.id)
if _, ok := s.peers[req.peer]; ok {
s.trienodeHealIdlers[req.peer] = struct{}{}
}
s.lock.Unlock()
// If there's a timeout timer still running, abort it and mark the trie node
@ -1867,9 +1883,13 @@ func (s *Syncer) revertBytecodeHealRequest(req *bytecodeHealRequest) {
}
close(req.stale)
// Remove the request from the tracked set
// Remove the request from the tracked set and restore the peer to the
// idle pool so it can be reassigned work (skip if peer already left).
s.lock.Lock()
delete(s.bytecodeHealReqs, req.id)
if _, ok := s.peers[req.peer]; ok {
s.bytecodeHealIdlers[req.peer] = struct{}{}
}
s.lock.Unlock()
// If there's a timeout timer still running, abort it and mark the code