mirror of
https://github.com/ethereum/go-ethereum.git
synced 2026-02-26 07:37:20 +00:00
eth/protocols/snap: restore peers to idle pool on request revert (#33790)
All five `revert*Request` functions (account, bytecode, storage, trienode heal, bytecode heal) remove the request from the tracked set but never restore the peer to its corresponding idle pool. When a request times out and no response arrives, the peer is permanently lost from the idle pool, preventing new work from being assigned to it. In normal operation mode (snap-sync of the full state) this bug is masked by pivot movement (which resets the idle pools via a new Sync() cycle every ~15 minutes) and by peer churn (reconnections re-add peers via Register()). However, in scenarios like the one I have running my [partial-stateful node](https://github.com/ethereum/go-ethereum/pull/33764) — long-running sync cycles with few peers — all peers can eventually leak out of the idle pools, stalling sync entirely. Fix: after deleting the request from the request map, restore the peer to its idle pool if it is still registered (this guards against the peer-drop path, where Unregister has already removed the peer). This mirrors the pattern used in all five On* response handlers. The bug only seems to manifest in peer-starved scenarios, such as the one I find myself in when testing snap sync for the partial-stateful node. Still, I thought it was worth raising; I am unsure whether it needs further discussion.
This commit is contained in:
parent
82fad31540
commit
c2e1785a48
1 changed files with 25 additions and 5 deletions
|
|
@ -1699,9 +1699,13 @@ func (s *Syncer) revertAccountRequest(req *accountRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.accountReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.accountIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the account
|
||||
|
|
@ -1740,9 +1744,13 @@ func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.bytecodeReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.bytecodeIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the code
|
||||
|
|
@ -1781,9 +1789,13 @@ func (s *Syncer) revertStorageRequest(req *storageRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.storageReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.storageIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the storage
|
||||
|
|
@ -1826,9 +1838,13 @@ func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.trienodeHealReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.trienodeHealIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the trie node
|
||||
|
|
@ -1867,9 +1883,13 @@ func (s *Syncer) revertBytecodeHealRequest(req *bytecodeHealRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.bytecodeHealReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.bytecodeHealIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the code
|
||||
|
|
|
|||
Loading…
Reference in a new issue