trie/bintrie: skip clean nodes in CollectNodes to reduce commit write amplification (#34754)

## Problem `BinaryTrie.Commit` unconditionally walked every resolved in-memory node and flushed it into the `NodeSet`, producing one Pebble write per resolved internal + stem node on every block — even when the node's on-disk blob was bitwise identical to the previous commit. On a warm 400M-state workload this meant tens of thousands of redundant 65-byte writes per block, compounding Pebble compaction pressure on every commit. The existing `mustRecompute` flag tracks *hash* staleness, not *disk-blob* staleness: after `Hash()` completes, `mustRecompute` is cleared even though the fresh blob has not been persisted. It is therefore insufficient for a skip-flush optimization. ## Fix Mirror the MPT committer pattern (`trie/committer.go:51-56`) by adding a `dirty` flag on `InternalNode` and `StemNode` with the semantics *the on-disk blob is stale*. The flag is: - set to `true` wherever the node is created or structurally modified (the same call sites that already set `mustRecompute = true`); - set to `false` only after the node has been passed to the `flushfn` inside `CollectNodes`; - left `false` on nodes produced by `DeserializeNodeWithHash`, matching the *loaded from disk, already persisted* semantics. `CollectNodes` short-circuits on `!dirty` subtrees. The propagation invariant (an ancestor of any dirty node is itself dirty) is already maintained by the existing `InsertValuesAtStem` / `Insert` paths, which now mirror every `mustRecompute = true` setter with a `dirty = true` setter. ## Benchmark New `BenchmarkCollectNodes_SparseWrite` measures commit cost when only one leaf changes between blocks — the common case for state updates. 10,000-stem trie, one-leaf modification + Commit per iteration, Apple M4 Pro: | | before | after | delta | |---|---|---|---| | time / op | 12,653,000 ns | 7,336 ns | **~1,725×** | | bytes / op | 107,224,740 B | 37,774 B | **~2,839×** | | allocs / op | 80,953 | 134 | **~604×** | End-to-end impact on a real workload depends on the resolved-footprint-to-dirty-path ratio; the new `TestBinaryTrieCommitIncremental` provides a structural regression guard (asserts that a Commit following a single-leaf modification flushes a root-to-leaf path, not the whole tree). --- Found all of this stuff while bloating my #34706 DB to make some benchmarks. And saw we were spending A LOT OF TIME on hashing. Hope this helps the perf a bit. Will rebase the flat-state PR on top of this once merged.
2026-07-26 06:36:43 +00:00 · 2026-04-18 11:42:58 +02:00 · 2026-04-18 11:42:58 +02:00 · 61bfacc52f
commit 61bfacc52f
parent 573d94013c
8 changed files with 292 additions and 9 deletions
--- a/trie/bintrie/binary_node.go
+++ b/trie/bintrie/binary_node.go
@ -93,15 +93,15 @@ var invalidSerializedLength = errors.New("invalid serialized node length")
 // DeserializeNode deserializes a binary trie node from a byte slice. The
 // hash will be recomputed from the deserialized data.
 func DeserializeNode(serialized []byte, depth int) (BinaryNode, error) {
-	return deserializeNode(serialized, depth, common.Hash{}, true)
+	return deserializeNode(serialized, depth, common.Hash{}, true, true)
 }

 // DeserializeNodeWithHash deserializes a binary trie node from a byte slice, using the provided hash.
 func DeserializeNodeWithHash(serialized []byte, depth int, hn common.Hash) (BinaryNode, error) {
-	return deserializeNode(serialized, depth, hn, false)
+	return deserializeNode(serialized, depth, hn, false, false)
 }

-func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute bool) (BinaryNode, error) {
+func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute, dirty bool) (BinaryNode, error) {
 	if len(serialized) == 0 {
 		return Empty{}, nil
 	}
@ -117,6 +117,7 @@ func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute
 			right:         HashedNode(common.BytesToHash(serialized[33:65])),
 			hash:          hn,
 			mustRecompute: mustRecompute,
+			dirty:         dirty,
 		}, nil
 	case nodeTypeStem:
 		if len(serialized) < 64 {
@ -141,6 +142,7 @@ func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute
 			depth:         depth,
 			hash:          hn,
 			mustRecompute: mustRecompute,
+			dirty:         dirty,
 		}, nil
 	default:
 		return nil, errors.New("invalid node type")
--- a/trie/bintrie/empty.go
+++ b/trie/bintrie/empty.go
@ -36,6 +36,7 @@ func (e Empty) Insert(key []byte, value []byte, _ NodeResolverFn, depth int) (Bi
 		Values:        values[:],
 		depth:         depth,
 		mustRecompute: true,
+		dirty:         true,
 	}, nil
 }

@ -58,6 +59,7 @@ func (e Empty) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolverFn,
 		Values:        values,
 		depth:         depth,
 		mustRecompute: true,
+		dirty:         true,
 	}, nil
 }

--- a/trie/bintrie/empty_test.go
+++ b/trie/bintrie/empty_test.go
@ -220,3 +220,45 @@ func TestEmptyGetHeight(t *testing.T) {
 		t.Errorf("Expected height 0 for empty node, got %d", height)
 	}
 }
+
+// TestEmptyInsertMarksDirty verifies that a StemNode produced by Empty.Insert
+// is marked dirty. Without this, CollectNodes would skip the freshly created
+// stem and its blob would never reach disk, producing "missing trie node"
+// errors on subsequent reads.
+func TestEmptyInsertMarksDirty(t *testing.T) {
+	key := make([]byte, 32)
+	key[0] = 0xaa
+	val := make([]byte, 32)
+	val[0] = 0xbb
+	n, err := Empty{}.Insert(key, val, nil, 0)
+	if err != nil {
+		t.Fatalf("Insert: %v", err)
+	}
+	sn, ok := n.(*StemNode)
+	if !ok {
+		t.Fatalf("expected *StemNode, got %T", n)
+	}
+	if !sn.dirty {
+		t.Fatalf("stem produced by Empty.Insert must have dirty=true")
+	}
+}
+
+// TestEmptyInsertValuesAtStemMarksDirty is the analogous guard for the
+// bulk-insert entry point. Fresh stems created here must be dirty.
+func TestEmptyInsertValuesAtStemMarksDirty(t *testing.T) {
+	key := make([]byte, 32)
+	key[0] = 0xcc
+	values := make([][]byte, 256)
+	values[0] = make([]byte, 32)
+	n, err := Empty{}.InsertValuesAtStem(key, values, nil, 3)
+	if err != nil {
+		t.Fatalf("InsertValuesAtStem: %v", err)
+	}
+	sn, ok := n.(*StemNode)
+	if !ok {
+		t.Fatalf("expected *StemNode, got %T", n)
+	}
+	if !sn.dirty {
+		t.Fatalf("stem produced by Empty.InsertValuesAtStem must have dirty=true")
+	}
+}
--- a/trie/bintrie/internal_node.go
+++ b/trie/bintrie/internal_node.go
@ -62,6 +62,7 @@ type InternalNode struct {
 	depth       int

 	mustRecompute bool        // true if the hash needs to be recomputed
+	dirty         bool        // true if the node's on-disk blob is stale (needs flush)
 	hash          common.Hash // cached hash when mustRecompute == false
 }

@ -135,6 +136,7 @@ func (bt *InternalNode) Copy() BinaryNode {
 		right:         bt.right.Copy(),
 		depth:         bt.depth,
 		mustRecompute: bt.mustRecompute,
+		dirty:         bt.dirty,
 		hash:          bt.hash,
 	}
 }
@ -213,6 +215,7 @@ func (bt *InternalNode) InsertValuesAtStem(stem []byte, values [][]byte, resolve

 		bt.left, err = bt.left.InsertValuesAtStem(stem, values, resolver, depth+1)
 		bt.mustRecompute = true
+		bt.dirty = true
 		return bt, err
 	}

@ -238,12 +241,17 @@ func (bt *InternalNode) InsertValuesAtStem(stem []byte, values [][]byte, resolve

 	bt.right, err = bt.right.InsertValuesAtStem(stem, values, resolver, depth+1)
 	bt.mustRecompute = true
+	bt.dirty = true
 	return bt, err
 }

 // CollectNodes collects all child nodes at a given path, and flushes it
-// into the provided node collector.
+// into the provided node collector. Clean subtrees (dirty == false) are
+// skipped.
 func (bt *InternalNode) CollectNodes(path []byte, flushfn NodeFlushFn) error {
+	if !bt.dirty {
+		return nil
+	}
 	if bt.left != nil {
 		var p [256]byte
 		copy(p[:], path)
@ -263,6 +271,7 @@ func (bt *InternalNode) CollectNodes(path []byte, flushfn NodeFlushFn) error {
 		}
 	}
 	flushfn(path, bt)
+	bt.dirty = false
 	return nil
 }

--- a/trie/bintrie/internal_node_test.go
+++ b/trie/bintrie/internal_node_test.go
@ -351,17 +351,20 @@ func TestInternalNodeInsertValuesAtStem(t *testing.T) {

 // TestInternalNodeCollectNodes tests CollectNodes method
 func TestInternalNodeCollectNodes(t *testing.T) {
-	// Create an internal node with two stem children
+	// Create an internal node with two stem children. All three are
+	// marked dirty to mirror production semantics — see CollectNodes.
 	leftStem := &StemNode{
 		Stem:   make([]byte, 31),
 		Values: make([][]byte, 256),
 		depth:  1,
+		dirty:  true,
 	}

 	rightStem := &StemNode{
 		Stem:   make([]byte, 31),
 		Values: make([][]byte, 256),
 		depth:  1,
+		dirty:  true,
 	}
 	rightStem.Stem[0] = 0x80

@ -369,6 +372,7 @@ func TestInternalNodeCollectNodes(t *testing.T) {
 		depth: 0,
 		left:  leftStem,
 		right: rightStem,
+		dirty: true,
 	}

 	var collectedPaths [][]byte
@ -405,6 +409,52 @@ func TestInternalNodeCollectNodes(t *testing.T) {
 	}
 }

+// TestInternalNodeCollectNodesSkipsClean verifies clean subtrees are not
+// flushed. A dirty internal node over clean children only flushes itself;
+// a fully clean tree flushes nothing.
+func TestInternalNodeCollectNodesSkipsClean(t *testing.T) {
+	leftStem := &StemNode{
+		Stem:   make([]byte, 31),
+		Values: make([][]byte, 256),
+		depth:  1,
+	}
+	rightStem := &StemNode{
+		Stem:   make([]byte, 31),
+		Values: make([][]byte, 256),
+		depth:  1,
+	}
+	rightStem.Stem[0] = 0x80
+
+	dirtyParent := &InternalNode{
+		depth: 0,
+		left:  leftStem,
+		right: rightStem,
+		dirty: true,
+	}
+
+	var collected []BinaryNode
+	flushFn := func(_ []byte, n BinaryNode) { collected = append(collected, n) }
+
+	if err := dirtyParent.CollectNodes([]byte{1}, flushFn); err != nil {
+		t.Fatalf("CollectNodes: %v", err)
+	}
+	if len(collected) != 1 || collected[0] != dirtyParent {
+		t.Fatalf("expected only the dirty parent to be flushed, got %d nodes", len(collected))
+	}
+	if dirtyParent.dirty {
+		t.Errorf("parent dirty flag should be cleared after flush")
+	}
+
+	// Second call on the same tree should be a no-op: everything is clean.
+	collected = nil
+	if err := dirtyParent.CollectNodes([]byte{1}, flushFn); err != nil {
+		t.Fatalf("CollectNodes (second call): %v", err)
+	}
+	if len(collected) != 0 {
+		t.Errorf("expected no nodes to be flushed on clean tree, got %d", len(collected))
+	}
+}
+
 // TestInternalNodeGetHeight tests GetHeight method
 func TestInternalNodeGetHeight(t *testing.T) {
 	// Create a tree with different heights
--- a/trie/bintrie/stem_node.go
+++ b/trie/bintrie/stem_node.go
@ -32,6 +32,7 @@ type StemNode struct {
 	depth  int      // Depth of the node

 	mustRecompute bool        // true if the hash needs to be recomputed
+	dirty         bool        // true if the node's on-disk blob is stale (needs flush)
 	hash          common.Hash // cached hash when mustRecompute == false
 }

@ -48,8 +49,11 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int
 	if !bytes.Equal(bt.Stem, key[:StemSize]) {
 		bitStem := bt.Stem[bt.depth/8] >> (7 - (bt.depth % 8)) & 1

-		n := &InternalNode{depth: bt.depth, mustRecompute: true}
+		n := &InternalNode{depth: bt.depth, mustRecompute: true, dirty: true}
 		bt.depth++
+		// bt is re-parented under n and sits at a new path — rewrite its blob.
+		bt.mustRecompute = true
+		bt.dirty = true
 		var child, other *BinaryNode
 		if bitStem == 0 {
 			n.left = bt
@ -77,6 +81,7 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int
 				Values:        values[:],
 				depth:         depth + 1,
 				mustRecompute: true,
+				dirty:         true,
 			}
 		}
 		return n, nil
@ -86,6 +91,7 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int
 	}
 	bt.Values[key[StemSize]] = value
 	bt.mustRecompute = true
+	bt.dirty = true
 	return bt, nil
 }

@ -101,6 +107,7 @@ func (bt *StemNode) Copy() BinaryNode {
 		depth:         bt.depth,
 		hash:          bt.hash,
 		mustRecompute: bt.mustRecompute,
+		dirty:         bt.dirty,
 	}
 }

@ -151,10 +158,14 @@ func (bt *StemNode) Hash() common.Hash {
 	return bt.hash
 }

-// CollectNodes collects all child nodes at a given path, and flushes it
-// into the provided node collector.
+// CollectNodes flushes the stem via the collector when dirty; clean stems
+// are skipped.
 func (bt *StemNode) CollectNodes(path []byte, flush NodeFlushFn) error {
+	if !bt.dirty {
+		return nil
+	}
 	flush(path, bt)
+	bt.dirty = false
 	return nil
 }

@ -172,8 +183,11 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv
 	if !bytes.Equal(bt.Stem, key[:StemSize]) {
 		bitStem := bt.Stem[bt.depth/8] >> (7 - (bt.depth % 8)) & 1

-		n := &InternalNode{depth: bt.depth, mustRecompute: true}
+		n := &InternalNode{depth: bt.depth, mustRecompute: true, dirty: true}
 		bt.depth++
+		// bt is re-parented under n and sits at a new path — rewrite its blob.
+		bt.mustRecompute = true
+		bt.dirty = true
 		var child, other *BinaryNode
 		if bitStem == 0 {
 			n.left = bt
@ -199,6 +213,7 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv
 				Values:        values,
 				depth:         n.depth + 1,
 				mustRecompute: true,
+				dirty:         true,
 			}
 		}
 		return n, nil
@ -209,6 +224,7 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv
 		if v != nil {
 			bt.Values[i] = v
 			bt.mustRecompute = true
+			bt.dirty = true
 		}
 	}
 	return bt, nil
--- a/trie/bintrie/stem_node_test.go
+++ b/trie/bintrie/stem_node_test.go
@ -379,6 +379,7 @@ func TestStemNodeCollectNodes(t *testing.T) {
 		Stem:   stem,
 		Values: values[:],
 		depth:  0,
+		dirty:  true,
 	}

 	var collectedPaths [][]byte
@ -412,3 +413,44 @@ func TestStemNodeCollectNodes(t *testing.T) {
 		t.Errorf("Path mismatch: expected [0, 1, 0], got %v", collectedPaths[0])
 	}
 }
+
+// TestStemNodeCollectNodesSkipsClean verifies that a clean stem is not
+// flushed, and that flushing a dirty stem clears its dirty flag so that
+// a subsequent CollectNodes on the same node is a no-op.
+func TestStemNodeCollectNodesSkipsClean(t *testing.T) {
+	stem := make([]byte, 31)
+	node := &StemNode{
+		Stem:   stem,
+		Values: make([][]byte, 256),
+		depth:  0,
+	}
+
+	var collected []BinaryNode
+	flushFn := func(_ []byte, n BinaryNode) { collected = append(collected, n) }
+
+	if err := node.CollectNodes([]byte{0}, flushFn); err != nil {
+		t.Fatalf("CollectNodes on clean stem: %v", err)
+	}
+	if len(collected) != 0 {
+		t.Fatalf("expected clean stem not to be flushed, got %d", len(collected))
+	}
+
+	node.dirty = true
+	if err := node.CollectNodes([]byte{0}, flushFn); err != nil {
+		t.Fatalf("CollectNodes on dirty stem: %v", err)
+	}
+	if len(collected) != 1 {
+		t.Fatalf("expected dirty stem to be flushed once, got %d", len(collected))
+	}
+	if node.dirty {
+		t.Errorf("stem dirty flag should be cleared after flush")
+	}
+
+	collected = nil
+	if err := node.CollectNodes([]byte{0}, flushFn); err != nil {
+		t.Fatalf("CollectNodes after flush: %v", err)
+	}
+	if len(collected) != 0 {
+		t.Errorf("expected no flush on clean stem, got %d", len(collected))
+	}
+}
--- a/trie/bintrie/trie_test.go
+++ b/trie/bintrie/trie_test.go
@ -779,3 +779,123 @@ func TestGetStorageNonMembershipInternalRoot(t *testing.T) {
 		t.Fatalf("expected nil/zero for non-existent storage, got %x", got)
 	}
 }
+
+// commitKeyN derives a distinct 32-byte key from a seed integer. Used by
+// TestBinaryTrieCommitIncremental and BenchmarkCollectNodes_SparseWrite to
+// populate a trie with many disjoint stems.
+func commitKeyN(i int) [HashSize]byte {
+	var k [HashSize]byte
+	binary.BigEndian.PutUint64(k[:8], uint64(i)*0x9e3779b97f4a7c15)
+	binary.BigEndian.PutUint64(k[8:16], uint64(i)*0xc2b2ae3d27d4eb4f)
+	binary.BigEndian.PutUint64(k[16:24], uint64(i)*0x165667b19e3779f9)
+	binary.BigEndian.PutUint64(k[24:32], uint64(i)*0x85ebca77c2b2ae63)
+	return k
+}
+
+// TestBinaryTrieCommitIncremental verifies that a second Commit with only a
+// single modified leaf flushes only the path from that leaf to the root,
+// not the entire tree.
+func TestBinaryTrieCommitIncremental(t *testing.T) {
+	tr := &BinaryTrie{
+		root:   NewBinaryNode(),
+		tracer: trie.NewPrevalueTracer(),
+	}
+
+	const n = 512
+	keys := make([][HashSize]byte, n)
+	for i := range n {
+		keys[i] = commitKeyN(i + 1)
+		var v [HashSize]byte
+		binary.BigEndian.PutUint64(v[24:], uint64(i+1))
+		var err error
+		tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0)
+		if err != nil {
+			t.Fatalf("Insert %d: %v", i, err)
+		}
+	}
+
+	_, ns1 := tr.Commit(false)
+	if len(ns1.Nodes) == 0 {
+		t.Fatal("first Commit produced empty NodeSet")
+	}
+	if len(ns1.Nodes) < n {
+		t.Fatalf("first Commit: expected at least %d nodes, got %d", n, len(ns1.Nodes))
+	}
+
+	// Second Commit on the same trie with no modifications: NodeSet must
+	// be empty because every subtree is clean.
+	_, nsNoop := tr.Commit(false)
+	if len(nsNoop.Nodes) != 0 {
+		t.Fatalf("no-op Commit: expected empty NodeSet, got %d nodes", len(nsNoop.Nodes))
+	}
+
+	// Modify a single leaf's value. Only the path from that leaf to the
+	// root should appear in the next Commit's NodeSet.
+	var newVal [HashSize]byte
+	newVal[0] = 0xff
+	var err error
+	tr.root, err = tr.root.Insert(keys[n/2][:], newVal[:], nil, 0)
+	if err != nil {
+		t.Fatalf("Insert (modify): %v", err)
+	}
+	_, ns2 := tr.Commit(false)
+
+	// Path length for a binary trie of n=512 stems is bounded by the
+	// internal depth at which the modified stem sits. Allow generous
+	// slack: up to 64 nodes is fine, anywhere near n (512) is a regression.
+	if len(ns2.Nodes) == 0 {
+		t.Fatal("modified Commit produced empty NodeSet")
+	}
+	if len(ns2.Nodes) > 64 {
+		t.Fatalf("modified Commit: expected small NodeSet, got %d nodes (first Commit had %d)", len(ns2.Nodes), len(ns1.Nodes))
+	}
+	if len(ns2.Nodes) >= len(ns1.Nodes) {
+		t.Fatalf("expected second NodeSet (%d) to be smaller than first (%d)", len(ns2.Nodes), len(ns1.Nodes))
+	}
+}
+
+// BenchmarkCollectNodes_SparseWrite measures Commit cost when only one leaf
+// changes between blocks — the common case for state updates. After warm-up
+// (populate + initial Commit), each iteration modifies a single leaf and
+// re-Commits. Under the skip-clean optimization, each iteration flushes
+// only the root-to-leaf path; pre-fix behavior would re-flush the entire
+// tree every iteration.
+func BenchmarkCollectNodes_SparseWrite(b *testing.B) {
+	const n = 10_000
+
+	tr := &BinaryTrie{
+		root:   NewBinaryNode(),
+		tracer: trie.NewPrevalueTracer(),
+	}
+	keys := make([][HashSize]byte, n)
+	for i := range n {
+		keys[i] = commitKeyN(i + 1)
+		var v [HashSize]byte
+		binary.BigEndian.PutUint64(v[24:], uint64(i+1))
+		var err error
+		tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0)
+		if err != nil {
+			b.Fatalf("Insert %d: %v", i, err)
+		}
+	}
+	// Flush the initial tree so subsequent Commits reflect the
+	// single-modification workload we want to measure.
+	_, _ = tr.Commit(false)
+
+	var newVal [HashSize]byte
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		idx := i % n
+		binary.BigEndian.PutUint64(newVal[24:], uint64(i+1))
+		var err error
+		tr.root, err = tr.root.Insert(keys[idx][:], newVal[:], nil, 0)
+		if err != nil {
+			b.Fatalf("Insert at iter %d: %v", i, err)
+		}
+		_, ns := tr.Commit(false)
+		if len(ns.Nodes) == 0 {
+			b.Fatalf("iter %d: empty NodeSet", i)
+		}
+	}
+}