From c2e1785a48f12f1a4f8d533fc2ed5468b6241e92 Mon Sep 17 00:00:00 2001 From: CPerezz <37264926+CPerezz@users.noreply.github.com> Date: Tue, 24 Feb 2026 02:14:11 +0100 Subject: [PATCH] eth/protocols/snap: restore peers to idle pool on request revert (#33790) All five `revert*Request` functions (account, bytecode, storage, trienode heal, bytecode heal) remove the request from the tracked set but never restore the peer to its corresponding idle pool. When a request times out and no response arrives, the peer is permanently lost from the idle pool, preventing new work from being assigned to it. In normal operation mode (snap-sync full state) this bug is masked by pivot movement (which resets idle pools via new Sync() cycles every ~15 minutes) and peer churn (reconnections re-add peers via Register()). However in scenarios like the one I have running my (partial-stateful node)[https://github.com/ethereum/go-ethereum/pull/33764] with long-running sync cycles and few peers, all peers can eventually leak out of the idle pools, stalling sync entirely. Fix: after deleting from the request map, restore the peer to its idle pool if it is still registered (guards against the peer-drop path where Unregister already removed the peer). This mirrors the pattern used in all five On* response handlers. This only seems to manifest in peer-thirstly scenarios as where I find myself when testing snapsync for the partial-statefull node). Still, thought was at least good to raise this point. Unsure if required to discuss or not --- eth/protocols/snap/sync.go | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/eth/protocols/snap/sync.go b/eth/protocols/snap/sync.go index 08e85c896a..841bfb446e 100644 --- a/eth/protocols/snap/sync.go +++ b/eth/protocols/snap/sync.go @@ -1699,9 +1699,13 @@ func (s *Syncer) revertAccountRequest(req *accountRequest) { } close(req.stale) - // Remove the request from the tracked set + // Remove the request from the tracked set and restore the peer to the + // idle pool so it can be reassigned work (skip if peer already left). s.lock.Lock() delete(s.accountReqs, req.id) + if _, ok := s.peers[req.peer]; ok { + s.accountIdlers[req.peer] = struct{}{} + } s.lock.Unlock() // If there's a timeout timer still running, abort it and mark the account @@ -1740,9 +1744,13 @@ func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) { } close(req.stale) - // Remove the request from the tracked set + // Remove the request from the tracked set and restore the peer to the + // idle pool so it can be reassigned work (skip if peer already left). s.lock.Lock() delete(s.bytecodeReqs, req.id) + if _, ok := s.peers[req.peer]; ok { + s.bytecodeIdlers[req.peer] = struct{}{} + } s.lock.Unlock() // If there's a timeout timer still running, abort it and mark the code @@ -1781,9 +1789,13 @@ func (s *Syncer) revertStorageRequest(req *storageRequest) { } close(req.stale) - // Remove the request from the tracked set + // Remove the request from the tracked set and restore the peer to the + // idle pool so it can be reassigned work (skip if peer already left). s.lock.Lock() delete(s.storageReqs, req.id) + if _, ok := s.peers[req.peer]; ok { + s.storageIdlers[req.peer] = struct{}{} + } s.lock.Unlock() // If there's a timeout timer still running, abort it and mark the storage @@ -1826,9 +1838,13 @@ func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) { } close(req.stale) - // Remove the request from the tracked set + // Remove the request from the tracked set and restore the peer to the + // idle pool so it can be reassigned work (skip if peer already left). s.lock.Lock() delete(s.trienodeHealReqs, req.id) + if _, ok := s.peers[req.peer]; ok { + s.trienodeHealIdlers[req.peer] = struct{}{} + } s.lock.Unlock() // If there's a timeout timer still running, abort it and mark the trie node @@ -1867,9 +1883,13 @@ func (s *Syncer) revertBytecodeHealRequest(req *bytecodeHealRequest) { } close(req.stale) - // Remove the request from the tracked set + // Remove the request from the tracked set and restore the peer to the + // idle pool so it can be reassigned work (skip if peer already left). s.lock.Lock() delete(s.bytecodeHealReqs, req.id) + if _, ok := s.peers[req.peer]; ok { + s.bytecodeHealIdlers[req.peer] = struct{}{} + } s.lock.Unlock() // If there's a timeout timer still running, abort it and mark the code