From fad11d579551d184f2f7ecfe051aa70ab204d368 Mon Sep 17 00:00:00 2001 From: CPerezz Date: Fri, 17 Apr 2026 15:54:16 +0200 Subject: [PATCH] trie/bintrie: skip clean nodes in CollectNodes to reduce commit write amplification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BinaryTrie.Commit unconditionally walked every resolved in-memory node and flushed it into the NodeSet, producing one Pebble write per resolved internal + stem node on every block — even when the node's on-disk blob was bitwise identical to the previous commit. On a warm 400M-state workload this meant tens of thousands of redundant 65-byte writes per block, compounding Pebble compaction pressure on every commit. The existing mustRecompute flag tracks hash staleness, not disk-blob staleness: after Hash() completes, mustRecompute is cleared even though the fresh blob has not been persisted. It is therefore insufficient for a skip-flush optimization. This change mirrors MPT's committer pattern (trie/committer.go:51-56) by adding a dirty flag on InternalNode and StemNode with the semantics "the on-disk blob is stale". The flag is: - set to true wherever the node is created or structurally modified (the same call sites that already set mustRecompute = true), - set to false only after the node has been passed to the flushfn inside CollectNodes, - left false on nodes produced by DeserializeNodeWithHash, matching the "loaded from disk, already persisted" semantics. CollectNodes short-circuits on !dirty subtrees; the propagation invariant (an ancestor of any dirty node is itself dirty) is already maintained by the existing InsertValuesAtStem / Insert paths, which now mirror every mustRecompute = true setter with a dirty = true setter. Serialization format, hash computation, state root, and the pathdb write path are untouched. Empty NodeSets are already tolerated by triedb/pathdb.writeNodes. BenchmarkCollectNodes_SparseWrite (10,000-stem trie, one-leaf modification + Commit per iteration, Apple M4 Pro): before 12,653,000 ns/op 107,224,740 B/op 80,953 allocs/op after 7,336 ns/op 37,774 B/op 134 allocs/op speedup: ~1,725x memory: ~2,839x less allocs: ~604x fewer End-to-end impact on a benchmarked geth build depends on workload; the new TestBinaryTrieCommitIncremental provides a structural regression guard. --- trie/bintrie/binary_node.go | 8 +- trie/bintrie/internal_node.go | 11 ++- trie/bintrie/internal_node_test.go | 52 ++++++++++++- trie/bintrie/stem_node.go | 24 +++++- trie/bintrie/stem_node_test.go | 42 ++++++++++ trie/bintrie/trie_test.go | 120 +++++++++++++++++++++++++++++ 6 files changed, 248 insertions(+), 9 deletions(-) diff --git a/trie/bintrie/binary_node.go b/trie/bintrie/binary_node.go index a7392ec958..8905a82285 100644 --- a/trie/bintrie/binary_node.go +++ b/trie/bintrie/binary_node.go @@ -93,15 +93,15 @@ var invalidSerializedLength = errors.New("invalid serialized node length") // DeserializeNode deserializes a binary trie node from a byte slice. The // hash will be recomputed from the deserialized data. func DeserializeNode(serialized []byte, depth int) (BinaryNode, error) { - return deserializeNode(serialized, depth, common.Hash{}, true) + return deserializeNode(serialized, depth, common.Hash{}, true, true) } // DeserializeNodeWithHash deserializes a binary trie node from a byte slice, using the provided hash. func DeserializeNodeWithHash(serialized []byte, depth int, hn common.Hash) (BinaryNode, error) { - return deserializeNode(serialized, depth, hn, false) + return deserializeNode(serialized, depth, hn, false, false) } -func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute bool) (BinaryNode, error) { +func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute, dirty bool) (BinaryNode, error) { if len(serialized) == 0 { return Empty{}, nil } @@ -117,6 +117,7 @@ func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute right: HashedNode(common.BytesToHash(serialized[33:65])), hash: hn, mustRecompute: mustRecompute, + dirty: dirty, }, nil case nodeTypeStem: if len(serialized) < 64 { @@ -141,6 +142,7 @@ func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute depth: depth, hash: hn, mustRecompute: mustRecompute, + dirty: dirty, }, nil default: return nil, errors.New("invalid node type") diff --git a/trie/bintrie/internal_node.go b/trie/bintrie/internal_node.go index 946203bcfb..811f65bcd8 100644 --- a/trie/bintrie/internal_node.go +++ b/trie/bintrie/internal_node.go @@ -62,6 +62,7 @@ type InternalNode struct { depth int mustRecompute bool // true if the hash needs to be recomputed + dirty bool // true if the node's on-disk blob is stale (needs flush) hash common.Hash // cached hash when mustRecompute == false } @@ -135,6 +136,7 @@ func (bt *InternalNode) Copy() BinaryNode { right: bt.right.Copy(), depth: bt.depth, mustRecompute: bt.mustRecompute, + dirty: bt.dirty, hash: bt.hash, } } @@ -213,6 +215,7 @@ func (bt *InternalNode) InsertValuesAtStem(stem []byte, values [][]byte, resolve bt.left, err = bt.left.InsertValuesAtStem(stem, values, resolver, depth+1) bt.mustRecompute = true + bt.dirty = true return bt, err } @@ -238,12 +241,17 @@ func (bt *InternalNode) InsertValuesAtStem(stem []byte, values [][]byte, resolve bt.right, err = bt.right.InsertValuesAtStem(stem, values, resolver, depth+1) bt.mustRecompute = true + bt.dirty = true return bt, err } // CollectNodes collects all child nodes at a given path, and flushes it -// into the provided node collector. +// into the provided node collector. Clean subtrees (dirty == false) are +// skipped. func (bt *InternalNode) CollectNodes(path []byte, flushfn NodeFlushFn) error { + if !bt.dirty { + return nil + } if bt.left != nil { var p [256]byte copy(p[:], path) @@ -263,6 +271,7 @@ func (bt *InternalNode) CollectNodes(path []byte, flushfn NodeFlushFn) error { } } flushfn(path, bt) + bt.dirty = false return nil } diff --git a/trie/bintrie/internal_node_test.go b/trie/bintrie/internal_node_test.go index 69097483fd..ddcec8085d 100644 --- a/trie/bintrie/internal_node_test.go +++ b/trie/bintrie/internal_node_test.go @@ -351,17 +351,20 @@ func TestInternalNodeInsertValuesAtStem(t *testing.T) { // TestInternalNodeCollectNodes tests CollectNodes method func TestInternalNodeCollectNodes(t *testing.T) { - // Create an internal node with two stem children + // Create an internal node with two stem children. All three are + // marked dirty to mirror production semantics — see CollectNodes. leftStem := &StemNode{ Stem: make([]byte, 31), Values: make([][]byte, 256), depth: 1, + dirty: true, } rightStem := &StemNode{ Stem: make([]byte, 31), Values: make([][]byte, 256), depth: 1, + dirty: true, } rightStem.Stem[0] = 0x80 @@ -369,6 +372,7 @@ func TestInternalNodeCollectNodes(t *testing.T) { depth: 0, left: leftStem, right: rightStem, + dirty: true, } var collectedPaths [][]byte @@ -405,6 +409,52 @@ func TestInternalNodeCollectNodes(t *testing.T) { } } +// TestInternalNodeCollectNodesSkipsClean verifies clean subtrees are not +// flushed. A dirty internal node over clean children only flushes itself; +// a fully clean tree flushes nothing. +func TestInternalNodeCollectNodesSkipsClean(t *testing.T) { + leftStem := &StemNode{ + Stem: make([]byte, 31), + Values: make([][]byte, 256), + depth: 1, + } + rightStem := &StemNode{ + Stem: make([]byte, 31), + Values: make([][]byte, 256), + depth: 1, + } + rightStem.Stem[0] = 0x80 + + dirtyParent := &InternalNode{ + depth: 0, + left: leftStem, + right: rightStem, + dirty: true, + } + + var collected []BinaryNode + flushFn := func(_ []byte, n BinaryNode) { collected = append(collected, n) } + + if err := dirtyParent.CollectNodes([]byte{1}, flushFn); err != nil { + t.Fatalf("CollectNodes: %v", err) + } + if len(collected) != 1 || collected[0] != dirtyParent { + t.Fatalf("expected only the dirty parent to be flushed, got %d nodes", len(collected)) + } + if dirtyParent.dirty { + t.Errorf("parent dirty flag should be cleared after flush") + } + + // Second call on the same tree should be a no-op: everything is clean. + collected = nil + if err := dirtyParent.CollectNodes([]byte{1}, flushFn); err != nil { + t.Fatalf("CollectNodes (second call): %v", err) + } + if len(collected) != 0 { + t.Errorf("expected no nodes to be flushed on clean tree, got %d", len(collected)) + } +} + // TestInternalNodeGetHeight tests GetHeight method func TestInternalNodeGetHeight(t *testing.T) { // Create a tree with different heights diff --git a/trie/bintrie/stem_node.go b/trie/bintrie/stem_node.go index e5729e6182..0ceae6b062 100644 --- a/trie/bintrie/stem_node.go +++ b/trie/bintrie/stem_node.go @@ -32,6 +32,7 @@ type StemNode struct { depth int // Depth of the node mustRecompute bool // true if the hash needs to be recomputed + dirty bool // true if the node's on-disk blob is stale (needs flush) hash common.Hash // cached hash when mustRecompute == false } @@ -48,8 +49,11 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int if !bytes.Equal(bt.Stem, key[:StemSize]) { bitStem := bt.Stem[bt.depth/8] >> (7 - (bt.depth % 8)) & 1 - n := &InternalNode{depth: bt.depth, mustRecompute: true} + n := &InternalNode{depth: bt.depth, mustRecompute: true, dirty: true} bt.depth++ + // bt is re-parented under n and sits at a new path — rewrite its blob. + bt.mustRecompute = true + bt.dirty = true var child, other *BinaryNode if bitStem == 0 { n.left = bt @@ -77,6 +81,7 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int Values: values[:], depth: depth + 1, mustRecompute: true, + dirty: true, } } return n, nil @@ -86,6 +91,7 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int } bt.Values[key[StemSize]] = value bt.mustRecompute = true + bt.dirty = true return bt, nil } @@ -101,6 +107,7 @@ func (bt *StemNode) Copy() BinaryNode { depth: bt.depth, hash: bt.hash, mustRecompute: bt.mustRecompute, + dirty: bt.dirty, } } @@ -151,10 +158,14 @@ func (bt *StemNode) Hash() common.Hash { return bt.hash } -// CollectNodes collects all child nodes at a given path, and flushes it -// into the provided node collector. +// CollectNodes flushes the stem via the collector when dirty; clean stems +// are skipped. func (bt *StemNode) CollectNodes(path []byte, flush NodeFlushFn) error { + if !bt.dirty { + return nil + } flush(path, bt) + bt.dirty = false return nil } @@ -172,8 +183,11 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv if !bytes.Equal(bt.Stem, key[:StemSize]) { bitStem := bt.Stem[bt.depth/8] >> (7 - (bt.depth % 8)) & 1 - n := &InternalNode{depth: bt.depth, mustRecompute: true} + n := &InternalNode{depth: bt.depth, mustRecompute: true, dirty: true} bt.depth++ + // bt is re-parented under n and sits at a new path — rewrite its blob. + bt.mustRecompute = true + bt.dirty = true var child, other *BinaryNode if bitStem == 0 { n.left = bt @@ -199,6 +213,7 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv Values: values, depth: n.depth + 1, mustRecompute: true, + dirty: true, } } return n, nil @@ -209,6 +224,7 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv if v != nil { bt.Values[i] = v bt.mustRecompute = true + bt.dirty = true } } return bt, nil diff --git a/trie/bintrie/stem_node_test.go b/trie/bintrie/stem_node_test.go index 310c553d39..2743e7ce9b 100644 --- a/trie/bintrie/stem_node_test.go +++ b/trie/bintrie/stem_node_test.go @@ -379,6 +379,7 @@ func TestStemNodeCollectNodes(t *testing.T) { Stem: stem, Values: values[:], depth: 0, + dirty: true, } var collectedPaths [][]byte @@ -412,3 +413,44 @@ func TestStemNodeCollectNodes(t *testing.T) { t.Errorf("Path mismatch: expected [0, 1, 0], got %v", collectedPaths[0]) } } + +// TestStemNodeCollectNodesSkipsClean verifies that a clean stem is not +// flushed, and that flushing a dirty stem clears its dirty flag so that +// a subsequent CollectNodes on the same node is a no-op. +func TestStemNodeCollectNodesSkipsClean(t *testing.T) { + stem := make([]byte, 31) + node := &StemNode{ + Stem: stem, + Values: make([][]byte, 256), + depth: 0, + } + + var collected []BinaryNode + flushFn := func(_ []byte, n BinaryNode) { collected = append(collected, n) } + + if err := node.CollectNodes([]byte{0}, flushFn); err != nil { + t.Fatalf("CollectNodes on clean stem: %v", err) + } + if len(collected) != 0 { + t.Fatalf("expected clean stem not to be flushed, got %d", len(collected)) + } + + node.dirty = true + if err := node.CollectNodes([]byte{0}, flushFn); err != nil { + t.Fatalf("CollectNodes on dirty stem: %v", err) + } + if len(collected) != 1 { + t.Fatalf("expected dirty stem to be flushed once, got %d", len(collected)) + } + if node.dirty { + t.Errorf("stem dirty flag should be cleared after flush") + } + + collected = nil + if err := node.CollectNodes([]byte{0}, flushFn); err != nil { + t.Fatalf("CollectNodes after flush: %v", err) + } + if len(collected) != 0 { + t.Errorf("expected no flush on clean stem, got %d", len(collected)) + } +} diff --git a/trie/bintrie/trie_test.go b/trie/bintrie/trie_test.go index 5b104ddde4..c436501464 100644 --- a/trie/bintrie/trie_test.go +++ b/trie/bintrie/trie_test.go @@ -779,3 +779,123 @@ func TestGetStorageNonMembershipInternalRoot(t *testing.T) { t.Fatalf("expected nil/zero for non-existent storage, got %x", got) } } + +// commitKeyN derives a distinct 32-byte key from a seed integer. Used by +// TestBinaryTrieCommitIncremental and BenchmarkCollectNodes_SparseWrite to +// populate a trie with many disjoint stems. +func commitKeyN(i int) [HashSize]byte { + var k [HashSize]byte + binary.BigEndian.PutUint64(k[:8], uint64(i)*0x9e3779b97f4a7c15) + binary.BigEndian.PutUint64(k[8:16], uint64(i)*0xc2b2ae3d27d4eb4f) + binary.BigEndian.PutUint64(k[16:24], uint64(i)*0x165667b19e3779f9) + binary.BigEndian.PutUint64(k[24:32], uint64(i)*0x85ebca77c2b2ae63) + return k +} + +// TestBinaryTrieCommitIncremental verifies that a second Commit with only a +// single modified leaf flushes only the path from that leaf to the root, +// not the entire tree. +func TestBinaryTrieCommitIncremental(t *testing.T) { + tr := &BinaryTrie{ + root: NewBinaryNode(), + tracer: trie.NewPrevalueTracer(), + } + + const n = 512 + keys := make([][HashSize]byte, n) + for i := range n { + keys[i] = commitKeyN(i + 1) + var v [HashSize]byte + binary.BigEndian.PutUint64(v[24:], uint64(i+1)) + var err error + tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0) + if err != nil { + t.Fatalf("Insert %d: %v", i, err) + } + } + + _, ns1 := tr.Commit(false) + if len(ns1.Nodes) == 0 { + t.Fatal("first Commit produced empty NodeSet") + } + if len(ns1.Nodes) < n { + t.Fatalf("first Commit: expected at least %d nodes, got %d", n, len(ns1.Nodes)) + } + + // Second Commit on the same trie with no modifications: NodeSet must + // be empty because every subtree is clean. + _, nsNoop := tr.Commit(false) + if len(nsNoop.Nodes) != 0 { + t.Fatalf("no-op Commit: expected empty NodeSet, got %d nodes", len(nsNoop.Nodes)) + } + + // Modify a single leaf's value. Only the path from that leaf to the + // root should appear in the next Commit's NodeSet. + var newVal [HashSize]byte + newVal[0] = 0xff + var err error + tr.root, err = tr.root.Insert(keys[n/2][:], newVal[:], nil, 0) + if err != nil { + t.Fatalf("Insert (modify): %v", err) + } + _, ns2 := tr.Commit(false) + + // Path length for a binary trie of n=512 stems is bounded by the + // internal depth at which the modified stem sits. Allow generous + // slack: up to 64 nodes is fine, anywhere near n (512) is a regression. + if len(ns2.Nodes) == 0 { + t.Fatal("modified Commit produced empty NodeSet") + } + if len(ns2.Nodes) > 64 { + t.Fatalf("modified Commit: expected small NodeSet, got %d nodes (first Commit had %d)", len(ns2.Nodes), len(ns1.Nodes)) + } + if len(ns2.Nodes) >= len(ns1.Nodes) { + t.Fatalf("expected second NodeSet (%d) to be smaller than first (%d)", len(ns2.Nodes), len(ns1.Nodes)) + } +} + +// BenchmarkCollectNodes_SparseWrite measures Commit cost when only one leaf +// changes between blocks — the common case for state updates. After warm-up +// (populate + initial Commit), each iteration modifies a single leaf and +// re-Commits. Under the skip-clean optimization, each iteration flushes +// only the root-to-leaf path; pre-fix behavior would re-flush the entire +// tree every iteration. +func BenchmarkCollectNodes_SparseWrite(b *testing.B) { + const n = 10_000 + + tr := &BinaryTrie{ + root: NewBinaryNode(), + tracer: trie.NewPrevalueTracer(), + } + keys := make([][HashSize]byte, n) + for i := range n { + keys[i] = commitKeyN(i + 1) + var v [HashSize]byte + binary.BigEndian.PutUint64(v[24:], uint64(i+1)) + var err error + tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0) + if err != nil { + b.Fatalf("Insert %d: %v", i, err) + } + } + // Flush the initial tree so subsequent Commits reflect the + // single-modification workload we want to measure. + _, _ = tr.Commit(false) + + var newVal [HashSize]byte + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + idx := i % n + binary.BigEndian.PutUint64(newVal[24:], uint64(i+1)) + var err error + tr.root, err = tr.root.Insert(keys[idx][:], newVal[:], nil, 0) + if err != nil { + b.Fatalf("Insert at iter %d: %v", i, err) + } + _, ns := tr.Commit(false) + if len(ns.Nodes) == 0 { + b.Fatalf("iter %d: empty NodeSet", i) + } + } +}