triedb/pathdb: replace crit panic shim with error propagation through Flush

Addresses review finding I2. The bintrieFlatCodec had a crit() helper whose doc claimed "delegates to log.Crit" but whose body was panic(fmt.Sprintf(...)). A corrupt on-disk stem blob would cause the buffer flush goroutine to panic, killing the process. On restart the same blob would cause the same panic — unrecoverable crash loop. Fix: applyWrites now returns ([]byte, error) instead of panicking. The Flush method on flatStateCodec gains an error return: (int, int) → (int, int, error) The error propagates up through writeStates → stateSet.write → buffer.flush → flushErr. A corrupted stem blob now causes a flush failure that the database can react to instead of a crash loop. The per-entry methods (WriteAccount, WriteStorage, DeleteAccount, DeleteStorage) — which are NOT on the production flush path — use log.Crit (the real function, not the deleted shim) on error, matching the merkle codec's existing convention for unrecoverable corruption at the per-entry level. The crit shim is deleted entirely.
2026-06-12 01:41:36 +00:00 · 2026-04-09 12:17:59 +02:00 · 2026-04-09 12:17:59 +02:00 · 69c0028094
commit 69c0028094
parent 149277be3c
7 changed files with 38 additions and 37 deletions
--- a/triedb/pathdb/buffer.go
+++ b/triedb/pathdb/buffer.go
@ -173,7 +173,11 @@ func (b *buffer) flush(root common.Hash, db ethdb.KeyValueStore, codec flatState
 			return
 		}
 		nodes := b.nodes.write(batch, nodesCache)
-		accounts, slots := b.states.write(batch, codec, progress, statesCache)
+		accounts, slots, flushErr := b.states.write(batch, codec, progress, statesCache)
+		if flushErr != nil {
+			b.flushErr = flushErr
+			return
+		}
 		rawdb.WritePersistentStateID(batch, id)
 		rawdb.WriteSnapshotRoot(batch, root)

--- a/triedb/pathdb/disklayer.go
+++ b/triedb/pathdb/disklayer.go
@ -633,7 +633,9 @@ func (dl *diskLayer) revert(h *stateHistory) (*diskLayer, error) {
 	writeNodes(batch, nodes, dl.nodes)

 	// Provide the original values of modified accounts and storages for revert
-	writeStates(batch, dl.db.flatCodec, progress, accounts, storages, dl.states)
+	if _, _, err := writeStates(batch, dl.db.flatCodec, progress, accounts, storages, dl.states); err != nil {
+		return nil, err
+	}
 	rawdb.WritePersistentStateID(batch, dl.id-1)
 	rawdb.WriteSnapshotRoot(batch, h.meta.parent)
 	if err := batch.Write(); err != nil {
--- a/triedb/pathdb/flat_codec.go
+++ b/triedb/pathdb/flat_codec.go
@ -152,7 +152,7 @@ type flatStateCodec interface {
 	// reporting; the merkle codec reports one per map entry, while the
 	// bintrie codec reports one per logical offset write (so the metrics
 	// remain comparable across schemes).
-	Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int)
+	Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int, error)
 }

 // merkleFlatCodec implements flatStateCodec for the keccak-keyed MPT flat
@ -254,7 +254,7 @@ func (c *merkleFlatCodec) StorageMarkerKey(accountHash, storageHash common.Hash)
 // This is the implementation that previously lived directly in writeStates.
 // It has been moved into the codec so the bintrie codec can supply its own
 // per-stem aggregating implementation alongside this one.
-func (c *merkleFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int) {
+func (c *merkleFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int, error) {
 	var (
 		accounts int
 		slots    int
@ -311,5 +311,5 @@ func (c *merkleFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountData
 			}
 		}
 	}
-	return accounts, slots
+	return accounts, slots, nil
 }
--- a/triedb/pathdb/flat_codec_bintrie.go
+++ b/triedb/pathdb/flat_codec_bintrie.go
@ -24,6 +24,7 @@ import (
 	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/core/rawdb"
 	"github.com/ethereum/go-ethereum/ethdb"
+	"github.com/ethereum/go-ethereum/log"
 	"github.com/ethereum/go-ethereum/trie/bintrie"
 )

@ -208,10 +209,11 @@ func (c *bintrieFlatCodec) ReadStorage(db ethdb.KeyValueReader, _ common.Hash, s
 func (c *bintrieFlatCodec) WriteAccount(batch ethdb.Batch, key common.Hash, blob []byte) {
 	writes, err := splitAccountBlob(blob)
 	if err != nil {
-		crit("bintrie WriteAccount: %v", err)
-		return
+		log.Crit("bintrie WriteAccount: split failed", "key", key, "err", err)
+	}
+	if _, err := c.applyWrites(batch, stemFromKey(key), writes); err != nil {
+		log.Crit("bintrie WriteAccount: apply failed", "key", key, "err", err)
 	}
-	c.applyWrites(batch, stemFromKey(key), writes)
 }

 // DeleteAccount clears offsets 0 (BasicData) and 1 (CodeHash) at the
@ -224,7 +226,9 @@ func (c *bintrieFlatCodec) DeleteAccount(batch ethdb.Batch, key common.Hash) {
 		{Offset: bintrie.BasicDataLeafKey, Value: nil},
 		{Offset: bintrie.CodeHashLeafKey, Value: nil},
 	}
-	c.applyWrites(batch, stemFromKey(key), writes)
+	if _, err := c.applyWrites(batch, stemFromKey(key), writes); err != nil {
+		log.Crit("bintrie DeleteAccount: apply failed", "key", key, "err", err)
+	}
 }

 // WriteStorage writes a single storage-slot value. The blob must be 32
@ -234,18 +238,21 @@ func (c *bintrieFlatCodec) DeleteAccount(batch ethdb.Batch, key common.Hash) {
 // The first parameter (accountKey) is ignored — see StorageKey.
 func (c *bintrieFlatCodec) WriteStorage(batch ethdb.Batch, _ common.Hash, storageKey common.Hash, blob []byte) {
 	if len(blob) != stemBlobValueSize {
-		crit("bintrie WriteStorage: value has len %d, want %d", len(blob), stemBlobValueSize)
-		return
+		log.Crit("bintrie WriteStorage: wrong value length", "key", storageKey, "len", len(blob), "want", stemBlobValueSize)
 	}
 	writes := []stemOffsetValue{{Offset: offsetFromKey(storageKey), Value: blob}}
-	c.applyWrites(batch, stemFromKey(storageKey), writes)
+	if _, err := c.applyWrites(batch, stemFromKey(storageKey), writes); err != nil {
+		log.Crit("bintrie WriteStorage: apply failed", "key", storageKey, "err", err)
+	}
 }

 // DeleteStorage clears a single offset at a stem. If the stem has no
 // other populated offsets afterwards, the key is removed entirely.
 func (c *bintrieFlatCodec) DeleteStorage(batch ethdb.Batch, _ common.Hash, storageKey common.Hash) {
 	writes := []stemOffsetValue{{Offset: offsetFromKey(storageKey), Value: nil}}
-	c.applyWrites(batch, stemFromKey(storageKey), writes)
+	if _, err := c.applyWrites(batch, stemFromKey(storageKey), writes); err != nil {
+		log.Crit("bintrie DeleteStorage: apply failed", "key", storageKey, "err", err)
+	}
 }

 // applyWrites performs a read-modify-write on the given stem: reads the
@ -263,19 +270,18 @@ func (c *bintrieFlatCodec) DeleteStorage(batch ethdb.Batch, _ common.Hash, stora
 // call for the same stem within a flush would re-read the pre-flush
 // state; see the pre-aggregation requirement documented on
 // bintrieFlatCodec.
-func (c *bintrieFlatCodec) applyWrites(batch ethdb.Batch, stem []byte, writes []stemOffsetValue) []byte {
+func (c *bintrieFlatCodec) applyWrites(batch ethdb.Batch, stem []byte, writes []stemOffsetValue) ([]byte, error) {
 	existing := rawdb.ReadBinTrieStem(c.db, stem)
 	merged, err := mergeStemBlob(existing, writes)
 	if err != nil {
-		crit("bintrie applyWrites: %v", err)
-		return nil
+		return nil, fmt.Errorf("bintrie stem %x: %w", stem, err)
 	}
 	if merged == nil {
 		rawdb.DeleteBinTrieStem(batch, stem)
-		return nil
+		return nil, nil
 	}
 	rawdb.WriteBinTrieStem(batch, stem, merged)
-	return merged
+	return merged, nil
 }

 // splitAccountBlob validates and splits the two-slot account payload
@ -442,7 +448,7 @@ func (c *bintrieFlatCodec) StorageMarkerKey(_ common.Hash, storageHash common.Ha
 //
 // Returns (offset count from accountData, offset count from storageData)
 // for metric reporting parity with the merkle path.
-func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int) {
+func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int, error) {
 	// Aggregate per-offset writes into per-stem batches. We use
 	// [31]byte as the map key because byte slices aren't hashable in
 	// Go and the stem is fixed size; the alternative (common.Hash with
@ -498,7 +504,9 @@ func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountDat
 	// will fall through to the trie reader on this per feedback #3 /
 	// Commit A2); a 32-byte value means the offset is populated.
 	for _, ag := range aggregated {
-		c.applyWrites(batch, ag.fullKeys[0][:bintrie.StemSize], ag.writes)
+		if _, err := c.applyWrites(batch, ag.fullKeys[0][:bintrie.StemSize], ag.writes); err != nil {
+			return accountWrites, storageWrites, fmt.Errorf("bintrie Flush: %w", err)
+		}
 		if clean == nil {
 			continue
 		}
@ -507,19 +515,6 @@ func (c *bintrieFlatCodec) Flush(batch ethdb.Batch, genMarker []byte, accountDat
 			clean.Set(cacheKey, ag.writes[i].Value)
 		}
 	}
-	return accountWrites, storageWrites
+	return accountWrites, storageWrites, nil
 }

-// crit is a shim around log.Crit that allows tests to replace the fatal
-// behavior with a panic if needed. Defined at the package level to match
-// the single-call-per-error style used by the merkle codec.
-func crit(format string, args ...any) {
-	// Import cycle avoidance: we delegate to log.Crit via the existing
-	// import in this package (see flat_codec.go for the merkle codec,
-	// which uses log.Crit through rawdb's own accessors).
-	// Here we keep the dependency light by just panicking; production
-	// flat-state corruption is unrecoverable and panicking surfaces the
-	// issue immediately rather than letting a silently-corrupted state
-	// root propagate.
-	panic(fmt.Sprintf(format, args...))
-}
--- a/triedb/pathdb/flat_codec_bintrie_test.go
+++ b/triedb/pathdb/flat_codec_bintrie_test.go
@ -327,7 +327,7 @@ func TestBintrieCodecFlushAggregates(t *testing.T) {
 	}

 	batch := db.NewBatch()
-	accW, stoW := codec.Flush(batch, nil, accountData, nil, nil)
+	accW, stoW, _ := codec.Flush(batch, nil, accountData, nil, nil)
 	flushBatch(t, batch)

 	if accW != 4 {
--- a/triedb/pathdb/flush.go
+++ b/triedb/pathdb/flush.go
@ -79,6 +79,6 @@ func writeNodes(batch ethdb.Batch, nodes map[common.Hash]map[string]*trienode.No
 // TODO(rjl493456442) do we really need this generation marker? The state updates
 // after the marker can also be written and will be fixed by generator later if
 // it's outdated.
-func writeStates(batch ethdb.Batch, codec flatStateCodec, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int) {
+func writeStates(batch ethdb.Batch, codec flatStateCodec, genMarker []byte, accountData map[common.Hash][]byte, storageData map[common.Hash]map[common.Hash][]byte, clean *fastcache.Cache) (int, int, error) {
 	return codec.Flush(batch, genMarker, accountData, storageData, clean)
 }
--- a/triedb/pathdb/states.go
+++ b/triedb/pathdb/states.go
@ -423,7 +423,7 @@ func (s *stateSet) decode(r *rlp.Stream) error {
 }

 // write flushes state mutations into the provided database batch as a whole.
-func (s *stateSet) write(batch ethdb.Batch, codec flatStateCodec, genMarker []byte, clean *fastcache.Cache) (int, int) {
+func (s *stateSet) write(batch ethdb.Batch, codec flatStateCodec, genMarker []byte, clean *fastcache.Cache) (int, int, error) {
 	return writeStates(batch, codec, genMarker, s.accountData, s.storageData, clean)
 }