mirror of
https://github.com/ethereum/go-ethereum.git
synced 2026-02-26 07:37:20 +00:00
eth/protocols/snap: restore peers to idle pool on request revert (#33790)
All five `revert*Request` functions (account, bytecode, storage, trienode heal, bytecode heal) remove the request from the tracked set but never restore the peer to its corresponding idle pool. When a request times out and no response arrives, the peer is permanently lost from the idle pool, preventing new work from being assigned to it. In normal operation mode (snap-sync of the full state) this bug is masked by pivot movement (which resets the idle pools via a new Sync() cycle every ~15 minutes) and by peer churn (reconnections re-add peers via Register()). However, in scenarios like the one I have running my [partial-stateful node](https://github.com/ethereum/go-ethereum/pull/33764) — long-running sync cycles with few peers — all peers can eventually leak out of the idle pools, stalling sync entirely. Fix: after deleting the request from the request map, restore the peer to its idle pool if it is still registered (this guards against the peer-drop path, where Unregister has already removed the peer). This mirrors the pattern used in all five On* response handlers. The bug only seems to manifest in peer-starved scenarios, such as the one I find myself in when testing snap sync for the partial-stateful node. Still, I thought it was worth raising; I am unsure whether it needs further discussion.
This commit is contained in:
parent
82fad31540
commit
c2e1785a48
1 changed files with 25 additions and 5 deletions
|
|
@ -1699,9 +1699,13 @@ func (s *Syncer) revertAccountRequest(req *accountRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.accountReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.accountIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the account
|
||||
|
|
@ -1740,9 +1744,13 @@ func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.bytecodeReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.bytecodeIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the code
|
||||
|
|
@ -1781,9 +1789,13 @@ func (s *Syncer) revertStorageRequest(req *storageRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.storageReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.storageIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the storage
|
||||
|
|
@ -1826,9 +1838,13 @@ func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.trienodeHealReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.trienodeHealIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the trie node
|
||||
|
|
@ -1867,9 +1883,13 @@ func (s *Syncer) revertBytecodeHealRequest(req *bytecodeHealRequest) {
|
|||
}
|
||||
close(req.stale)
|
||||
|
||||
// Remove the request from the tracked set
|
||||
// Remove the request from the tracked set and restore the peer to the
|
||||
// idle pool so it can be reassigned work (skip if peer already left).
|
||||
s.lock.Lock()
|
||||
delete(s.bytecodeHealReqs, req.id)
|
||||
if _, ok := s.peers[req.peer]; ok {
|
||||
s.bytecodeHealIdlers[req.peer] = struct{}{}
|
||||
}
|
||||
s.lock.Unlock()
|
||||
|
||||
// If there's a timeout timer still running, abort it and mark the code
|
||||
|
|
|
|||
Loading…
Reference in a new issue