go-ethereum/core/state/snapshot/disklayer.go
ozpool 963d78dfcc core/state/snapshot: make diskLayer.stopGeneration idempotent
Fixes the deadlock noted by the "TODO this function will hang if it's
called twice" markers in Tree.Disable and Tree.Rebuild. Both paths walk
every layer in the tree and unconditionally invoke stopGeneration on
each disk layer, so a layer reachable through both must tolerate a
second call.

The previous implementation sent on the unbuffered genAbort channel
whenever genMarker was non-nil. After the first abort handshake the
generator goroutine exits, but genMarker is only cleared on the
successful-completion path (generate.go), not on the aborted-mid-flight
path. A second stopGeneration therefore saw generating=true, sent on
genAbort with no receiver, and blocked forever.

Wrap the abort handshake in a sync.Once on the disk layer. The first
call drives the handshake exactly as before; subsequent calls are
no-ops. Remove both TODO comments now that the contract is honoured.

Adds TestStopGenerationIdempotent: stands in a mock generator goroutine
that consumes a single abort, calls stopGeneration twice, and fails
with an explicit message at a 5s deadline rather than hanging until
the test runner's outer timeout.

Fixes #33233.
2026-05-13 13:01:45 +05:30

208 lines
6.9 KiB
Go

// Copyright 2019 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package snapshot
import (
"bytes"
"sync"
"github.com/VictoriaMetrics/fastcache"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/rawdb"
"github.com/ethereum/go-ethereum/core/types"
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/rlp"
"github.com/ethereum/go-ethereum/triedb"
)
// diskLayer is a low level persistent snapshot built on top of a key-value store.
type diskLayer struct {
diskdb ethdb.KeyValueStore // Key-value store containing the base snapshot
triedb *triedb.Database // Trie node cache for reconstruction purposes
cache *fastcache.Cache // Cache to avoid hitting the disk for direct access
root common.Hash // Root hash of the base snapshot
stale bool // Signals that the layer became stale (state progressed)
genMarker []byte // Marker for the state that's indexed during initial layer generation
genPending chan struct{} // Notification channel when generation is done (test synchronicity)
genAbort chan chan *generatorStats // Notification channel to abort generating the snapshot in this layer
genStop sync.Once // Guards stopGeneration so it can be called multiple times safely
lock sync.RWMutex
}
// Release releases underlying resources; specifically the fastcache requires
// Reset() in order to not leak memory.
// OBS: It does not invoke Close on the diskdb
func (dl *diskLayer) Release() error {
if dl.cache != nil {
dl.cache.Reset()
}
return nil
}
// Root returns root hash for which this snapshot was made.
func (dl *diskLayer) Root() common.Hash {
return dl.root
}
// Parent always returns nil as there's no layer below the disk.
func (dl *diskLayer) Parent() snapshot {
return nil
}
// Stale return whether this layer has become stale (was flattened across) or if
// it's still live.
func (dl *diskLayer) Stale() bool {
dl.lock.RLock()
defer dl.lock.RUnlock()
return dl.stale
}
// markStale sets the stale flag as true.
func (dl *diskLayer) markStale() {
dl.lock.Lock()
defer dl.lock.Unlock()
dl.stale = true
}
// Account directly retrieves the account associated with a particular hash in
// the snapshot slim data format.
func (dl *diskLayer) Account(hash common.Hash) (*types.SlimAccount, error) {
data, err := dl.AccountRLP(hash)
if err != nil {
return nil, err
}
if len(data) == 0 { // can be both nil and []byte{}
return nil, nil
}
account := new(types.SlimAccount)
if err := rlp.DecodeBytes(data, account); err != nil {
panic(err)
}
return account, nil
}
// AccountRLP directly retrieves the account RLP associated with a particular
// hash in the snapshot slim data format.
func (dl *diskLayer) AccountRLP(hash common.Hash) ([]byte, error) {
dl.lock.RLock()
defer dl.lock.RUnlock()
// If the layer was flattened into, consider it invalid (any live reference to
// the original should be marked as unusable).
if dl.stale {
return nil, ErrSnapshotStale
}
// If the layer is being generated, ensure the requested hash has already been
// covered by the generator.
if dl.genMarker != nil && bytes.Compare(hash[:], dl.genMarker) > 0 {
return nil, ErrNotCoveredYet
}
// If we're in the disk layer, all diff layers missed
snapshotDirtyAccountMissMeter.Mark(1)
// Try to retrieve the account from the memory cache
if blob, found := dl.cache.HasGet(nil, hash[:]); found {
snapshotCleanAccountHitMeter.Mark(1)
snapshotCleanAccountReadMeter.Mark(int64(len(blob)))
return blob, nil
}
// Cache doesn't contain account, pull from disk and cache for later
blob := rawdb.ReadAccountSnapshot(dl.diskdb, hash)
dl.cache.Set(hash[:], blob)
snapshotCleanAccountMissMeter.Mark(1)
if n := len(blob); n > 0 {
snapshotCleanAccountWriteMeter.Mark(int64(n))
} else {
snapshotCleanAccountInexMeter.Mark(1)
}
return blob, nil
}
// Storage directly retrieves the storage data associated with a particular hash,
// within a particular account.
func (dl *diskLayer) Storage(accountHash, storageHash common.Hash) ([]byte, error) {
dl.lock.RLock()
defer dl.lock.RUnlock()
// If the layer was flattened into, consider it invalid (any live reference to
// the original should be marked as unusable).
if dl.stale {
return nil, ErrSnapshotStale
}
key := append(accountHash[:], storageHash[:]...)
// If the layer is being generated, ensure the requested hash has already been
// covered by the generator.
if dl.genMarker != nil && bytes.Compare(key, dl.genMarker) > 0 {
return nil, ErrNotCoveredYet
}
// If we're in the disk layer, all diff layers missed
snapshotDirtyStorageMissMeter.Mark(1)
// Try to retrieve the storage slot from the memory cache
if blob, found := dl.cache.HasGet(nil, key); found {
snapshotCleanStorageHitMeter.Mark(1)
snapshotCleanStorageReadMeter.Mark(int64(len(blob)))
return blob, nil
}
// Cache doesn't contain storage slot, pull from disk and cache for later
blob := rawdb.ReadStorageSnapshot(dl.diskdb, accountHash, storageHash)
dl.cache.Set(key, blob)
snapshotCleanStorageMissMeter.Mark(1)
if n := len(blob); n > 0 {
snapshotCleanStorageWriteMeter.Mark(int64(n))
} else {
snapshotCleanStorageInexMeter.Mark(1)
}
return blob, nil
}
// Update creates a new layer on top of the existing snapshot diff tree with
// the specified data items. Note, the maps are retained by the method to avoid
// copying everything.
func (dl *diskLayer) Update(blockHash common.Hash, accounts map[common.Hash][]byte, storage map[common.Hash]map[common.Hash][]byte) *diffLayer {
return newDiffLayer(dl, blockHash, accounts, storage)
}
// stopGeneration aborts the state snapshot generation if it is currently
// running. It is safe to call multiple times: only the first call dispatches
// an abort signal to the generator; subsequent calls are no-ops. Tree.Disable
// and Tree.Rebuild both invoke this on every disk layer, so a layer that is
// reached through both paths must not deadlock on the second send to the
// unbuffered genAbort channel.
func (dl *diskLayer) stopGeneration() {
dl.genStop.Do(func() {
dl.lock.RLock()
generating := dl.genMarker != nil
dl.lock.RUnlock()
if !generating {
return
}
if dl.genAbort != nil {
abort := make(chan *generatorStats)
dl.genAbort <- abort
<-abort
}
})
}