core/state/snapshot: make diskLayer.stopGeneration idempotent

Fixes the deadlock noted by the "TODO this function will hang if it's
called twice" markers in Tree.Disable and Tree.Rebuild. Both paths walk
every layer in the tree and unconditionally invoke stopGeneration on
each disk layer, so a layer reachable through both must tolerate a
second call.

The previous implementation sent on the unbuffered genAbort channel
whenever genMarker was non-nil. After the first abort handshake the
generator goroutine exits, but genMarker is only cleared on the
successful-completion path (generate.go), not on the aborted-mid-flight
path. A second stopGeneration therefore saw generating=true, sent on
genAbort with no receiver, and blocked forever.

Wrap the abort handshake in a sync.Once on the disk layer. The first
call drives the handshake exactly as before; subsequent calls are
no-ops. Remove both TODO comments now that the contract is honoured.

Adds TestStopGenerationIdempotent: stands in a mock generator goroutine
that consumes a single abort, calls stopGeneration twice, and fails
with an explicit message at a 5s deadline rather than hanging until
the test runner's outer timeout.

Fixes #33233.
This commit is contained in:
ozpool 2026-05-13 13:01:45 +05:30
parent 21c5a287f9
commit 963d78dfcc
3 changed files with 71 additions and 16 deletions

View file

@ -41,6 +41,7 @@ type diskLayer struct {
genMarker []byte // Marker for the state that's indexed during initial layer generation
genPending chan struct{} // Notification channel when generation is done (test synchronicity)
genAbort chan chan *generatorStats // Notification channel to abort generating the snapshot in this layer
genStop sync.Once // Guards stopGeneration so it can be called multiple times safely
lock sync.RWMutex
}
@ -184,17 +185,24 @@ func (dl *diskLayer) Update(blockHash common.Hash, accounts map[common.Hash][]by
return newDiffLayer(dl, blockHash, accounts, storage)
}
// stopGeneration aborts the state snapshot generation if it is currently running.
// stopGeneration aborts the state snapshot generation if it is currently
// running. It is safe to call multiple times: only the first call dispatches
// an abort signal to the generator; subsequent calls are no-ops. Tree.Disable
// and Tree.Rebuild both invoke this on every disk layer, so a layer that is
// reached through both paths must not deadlock on the second send to the
// unbuffered genAbort channel.
func (dl *diskLayer) stopGeneration() {
dl.lock.RLock()
generating := dl.genMarker != nil
dl.lock.RUnlock()
if !generating {
return
}
if dl.genAbort != nil {
abort := make(chan *generatorStats)
dl.genAbort <- abort
<-abort
}
dl.genStop.Do(func() {
dl.lock.RLock()
generating := dl.genMarker != nil
dl.lock.RUnlock()
if !generating {
return
}
if dl.genAbort != nil {
abort := make(chan *generatorStats)
dl.genAbort <- abort
<-abort
}
})
}

View file

@ -19,6 +19,7 @@ package snapshot
import (
"bytes"
"testing"
"time"
"github.com/VictoriaMetrics/fastcache"
"github.com/ethereum/go-ethereum/common"
@ -586,3 +587,53 @@ func TestDiskSeek(t *testing.T) {
}
}
}
// TestStopGenerationIdempotent verifies that stopGeneration can be invoked
// multiple times on the same disk layer without deadlocking. Tree.Disable and
// Tree.Rebuild both walk every disk layer and call this, so a layer that is
// reachable from both paths must not hang on the second call's send to the
// unbuffered genAbort channel after the generator goroutine has already exited.
// Regression test for issue #33233.
func TestStopGenerationIdempotent(t *testing.T) {
t.Parallel()
abortCh := make(chan chan *generatorStats)
dl := &diskLayer{
diskdb: rawdb.NewMemoryDatabase(),
genMarker: []byte{}, // non-nil signals generation in progress
genAbort: abortCh,
}
// Stand in for the generator goroutine: receive on genAbort exactly once,
// then exit. A second send by stopGeneration would deadlock the test.
generatorExited := make(chan struct{})
go func() {
defer close(generatorExited)
ack := <-abortCh
ack <- &generatorStats{}
}()
// First call drives the abort handshake.
dl.stopGeneration()
// Generator must have observed the first abort and exited.
select {
case <-generatorExited:
case <-time.After(5 * time.Second):
t.Fatal("first stopGeneration never delivered abort to generator")
}
// Second call must return immediately. Run in a goroutine so the test
// fails with a clear message on regression instead of hanging until the
// outer test timeout fires.
done := make(chan struct{})
go func() {
defer close(done)
dl.stopGeneration()
}()
select {
case <-done:
case <-time.After(5 * time.Second):
t.Fatal("second stopGeneration deadlocked sending to genAbort after the generator had exited")
}
}

View file

@ -258,8 +258,6 @@ func (t *Tree) Disable() {
for _, layer := range t.layers {
switch layer := layer.(type) {
case *diskLayer:
// TODO this function will hang if it's called twice. Will
// fix it in the following PRs.
layer.stopGeneration()
layer.markStale()
layer.Release()
@ -702,8 +700,6 @@ func (t *Tree) Rebuild(root common.Hash) {
for _, layer := range t.layers {
switch layer := layer.(type) {
case *diskLayer:
// TODO this function will hang if it's called twice. Will
// fix it in the following PRs.
layer.stopGeneration()
layer.markStale()
layer.Release()