trie/bintrie: skip clean nodes in CollectNodes to reduce commit write amplification (#34754)
Some checks are pending
/ Linux Build (push) Waiting to run
/ Linux Build (arm) (push) Waiting to run
/ Keeper Build (push) Waiting to run
/ Windows Build (push) Waiting to run
/ Docker Image (push) Waiting to run

## Problem

`BinaryTrie.Commit` unconditionally walked every resolved in-memory node
and flushed it into the `NodeSet`, producing one Pebble write per
resolved internal + stem node on every block — even when the node's
on-disk blob was bitwise identical to the previous commit. On a warm
400M-state workload this meant tens of thousands of redundant 65-byte
writes per block, compounding Pebble compaction pressure on every
commit.

The existing `mustRecompute` flag tracks *hash* staleness, not
*disk-blob* staleness: after `Hash()` completes, `mustRecompute` is
cleared even though the fresh blob has not been persisted. It is
therefore insufficient for a skip-flush optimization.

## Fix

Mirror the MPT committer pattern (`trie/committer.go:51-56`) by adding a
`dirty` flag on `InternalNode` and `StemNode` with the semantics *the
on-disk blob is stale*. The flag is:

- set to `true` wherever the node is created or structurally modified
(the same call sites that already set `mustRecompute = true`);
- set to `false` only after the node has been passed to the `flushfn`
inside `CollectNodes`;
- left `false` on nodes produced by `DeserializeNodeWithHash`, matching
the *loaded from disk, already persisted* semantics.

`CollectNodes` short-circuits on `!dirty` subtrees. The propagation
invariant (an ancestor of any dirty node is itself dirty) is already
maintained by the existing `InsertValuesAtStem` / `Insert` paths, which
now mirror every `mustRecompute = true` setter with a `dirty = true`
setter.

## Benchmark

New `BenchmarkCollectNodes_SparseWrite` measures commit cost when only
one leaf changes between blocks — the common case for state updates.
10,000-stem trie, one-leaf modification + Commit per iteration, Apple M4
Pro:

| | before | after | delta |
|---|---|---|---|
| time / op | 12,653,000 ns | 7,336 ns | **~1,725×** |
| bytes / op | 107,224,740 B | 37,774 B | **~2,839×** |
| allocs / op | 80,953 | 134 | **~604×** |

End-to-end impact on a real workload depends on the
resolved-footprint-to-dirty-path ratio; the new
`TestBinaryTrieCommitIncremental` provides a structural regression guard
(asserts that a Commit following a single-leaf modification flushes a
root-to-leaf path, not the whole tree).

---

Found all of this stuff while bloating my #34706 DB to make some
benchmarks. And saw we were spending A LOT OF TIME on hashing.
Hope this helps the perf a bit. Will rebase the flat-state PR on top of
this once merged.
This commit is contained in:
CPerezz 2026-04-18 11:42:58 +02:00 committed by GitHub
parent 573d94013c
commit 61bfacc52f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 292 additions and 9 deletions

View file

@ -93,15 +93,15 @@ var invalidSerializedLength = errors.New("invalid serialized node length")
// DeserializeNode deserializes a binary trie node from a byte slice. The
// hash will be recomputed from the deserialized data.
func DeserializeNode(serialized []byte, depth int) (BinaryNode, error) {
return deserializeNode(serialized, depth, common.Hash{}, true)
return deserializeNode(serialized, depth, common.Hash{}, true, true)
}
// DeserializeNodeWithHash deserializes a binary trie node from a byte slice, using the provided hash.
func DeserializeNodeWithHash(serialized []byte, depth int, hn common.Hash) (BinaryNode, error) {
return deserializeNode(serialized, depth, hn, false)
return deserializeNode(serialized, depth, hn, false, false)
}
func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute bool) (BinaryNode, error) {
func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute, dirty bool) (BinaryNode, error) {
if len(serialized) == 0 {
return Empty{}, nil
}
@ -117,6 +117,7 @@ func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute
right: HashedNode(common.BytesToHash(serialized[33:65])),
hash: hn,
mustRecompute: mustRecompute,
dirty: dirty,
}, nil
case nodeTypeStem:
if len(serialized) < 64 {
@ -141,6 +142,7 @@ func deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute
depth: depth,
hash: hn,
mustRecompute: mustRecompute,
dirty: dirty,
}, nil
default:
return nil, errors.New("invalid node type")

View file

@ -36,6 +36,7 @@ func (e Empty) Insert(key []byte, value []byte, _ NodeResolverFn, depth int) (Bi
Values: values[:],
depth: depth,
mustRecompute: true,
dirty: true,
}, nil
}
@ -58,6 +59,7 @@ func (e Empty) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolverFn,
Values: values,
depth: depth,
mustRecompute: true,
dirty: true,
}, nil
}

View file

@ -220,3 +220,45 @@ func TestEmptyGetHeight(t *testing.T) {
t.Errorf("Expected height 0 for empty node, got %d", height)
}
}
// TestEmptyInsertMarksDirty verifies that a StemNode produced by Empty.Insert
// is marked dirty. Without this, CollectNodes would skip the freshly created
// stem and its blob would never reach disk, producing "missing trie node"
// errors on subsequent reads.
func TestEmptyInsertMarksDirty(t *testing.T) {
key := make([]byte, 32)
key[0] = 0xaa
val := make([]byte, 32)
val[0] = 0xbb
n, err := Empty{}.Insert(key, val, nil, 0)
if err != nil {
t.Fatalf("Insert: %v", err)
}
sn, ok := n.(*StemNode)
if !ok {
t.Fatalf("expected *StemNode, got %T", n)
}
if !sn.dirty {
t.Fatalf("stem produced by Empty.Insert must have dirty=true")
}
}
// TestEmptyInsertValuesAtStemMarksDirty is the analogous guard for the
// bulk-insert entry point. Fresh stems created here must be dirty.
func TestEmptyInsertValuesAtStemMarksDirty(t *testing.T) {
key := make([]byte, 32)
key[0] = 0xcc
values := make([][]byte, 256)
values[0] = make([]byte, 32)
n, err := Empty{}.InsertValuesAtStem(key, values, nil, 3)
if err != nil {
t.Fatalf("InsertValuesAtStem: %v", err)
}
sn, ok := n.(*StemNode)
if !ok {
t.Fatalf("expected *StemNode, got %T", n)
}
if !sn.dirty {
t.Fatalf("stem produced by Empty.InsertValuesAtStem must have dirty=true")
}
}

View file

@ -62,6 +62,7 @@ type InternalNode struct {
depth int
mustRecompute bool // true if the hash needs to be recomputed
dirty bool // true if the node's on-disk blob is stale (needs flush)
hash common.Hash // cached hash when mustRecompute == false
}
@ -135,6 +136,7 @@ func (bt *InternalNode) Copy() BinaryNode {
right: bt.right.Copy(),
depth: bt.depth,
mustRecompute: bt.mustRecompute,
dirty: bt.dirty,
hash: bt.hash,
}
}
@ -213,6 +215,7 @@ func (bt *InternalNode) InsertValuesAtStem(stem []byte, values [][]byte, resolve
bt.left, err = bt.left.InsertValuesAtStem(stem, values, resolver, depth+1)
bt.mustRecompute = true
bt.dirty = true
return bt, err
}
@ -238,12 +241,17 @@ func (bt *InternalNode) InsertValuesAtStem(stem []byte, values [][]byte, resolve
bt.right, err = bt.right.InsertValuesAtStem(stem, values, resolver, depth+1)
bt.mustRecompute = true
bt.dirty = true
return bt, err
}
// CollectNodes collects all child nodes at a given path, and flushes it
// into the provided node collector.
// into the provided node collector. Clean subtrees (dirty == false) are
// skipped.
func (bt *InternalNode) CollectNodes(path []byte, flushfn NodeFlushFn) error {
if !bt.dirty {
return nil
}
if bt.left != nil {
var p [256]byte
copy(p[:], path)
@ -263,6 +271,7 @@ func (bt *InternalNode) CollectNodes(path []byte, flushfn NodeFlushFn) error {
}
}
flushfn(path, bt)
bt.dirty = false
return nil
}

View file

@ -351,17 +351,20 @@ func TestInternalNodeInsertValuesAtStem(t *testing.T) {
// TestInternalNodeCollectNodes tests CollectNodes method
func TestInternalNodeCollectNodes(t *testing.T) {
// Create an internal node with two stem children
// Create an internal node with two stem children. All three are
// marked dirty to mirror production semantics — see CollectNodes.
leftStem := &StemNode{
Stem: make([]byte, 31),
Values: make([][]byte, 256),
depth: 1,
dirty: true,
}
rightStem := &StemNode{
Stem: make([]byte, 31),
Values: make([][]byte, 256),
depth: 1,
dirty: true,
}
rightStem.Stem[0] = 0x80
@ -369,6 +372,7 @@ func TestInternalNodeCollectNodes(t *testing.T) {
depth: 0,
left: leftStem,
right: rightStem,
dirty: true,
}
var collectedPaths [][]byte
@ -405,6 +409,52 @@ func TestInternalNodeCollectNodes(t *testing.T) {
}
}
// TestInternalNodeCollectNodesSkipsClean verifies clean subtrees are not
// flushed. A dirty internal node over clean children only flushes itself;
// a fully clean tree flushes nothing.
func TestInternalNodeCollectNodesSkipsClean(t *testing.T) {
leftStem := &StemNode{
Stem: make([]byte, 31),
Values: make([][]byte, 256),
depth: 1,
}
rightStem := &StemNode{
Stem: make([]byte, 31),
Values: make([][]byte, 256),
depth: 1,
}
rightStem.Stem[0] = 0x80
dirtyParent := &InternalNode{
depth: 0,
left: leftStem,
right: rightStem,
dirty: true,
}
var collected []BinaryNode
flushFn := func(_ []byte, n BinaryNode) { collected = append(collected, n) }
if err := dirtyParent.CollectNodes([]byte{1}, flushFn); err != nil {
t.Fatalf("CollectNodes: %v", err)
}
if len(collected) != 1 || collected[0] != dirtyParent {
t.Fatalf("expected only the dirty parent to be flushed, got %d nodes", len(collected))
}
if dirtyParent.dirty {
t.Errorf("parent dirty flag should be cleared after flush")
}
// Second call on the same tree should be a no-op: everything is clean.
collected = nil
if err := dirtyParent.CollectNodes([]byte{1}, flushFn); err != nil {
t.Fatalf("CollectNodes (second call): %v", err)
}
if len(collected) != 0 {
t.Errorf("expected no nodes to be flushed on clean tree, got %d", len(collected))
}
}
// TestInternalNodeGetHeight tests GetHeight method
func TestInternalNodeGetHeight(t *testing.T) {
// Create a tree with different heights

View file

@ -32,6 +32,7 @@ type StemNode struct {
depth int // Depth of the node
mustRecompute bool // true if the hash needs to be recomputed
dirty bool // true if the node's on-disk blob is stale (needs flush)
hash common.Hash // cached hash when mustRecompute == false
}
@ -48,8 +49,11 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int
if !bytes.Equal(bt.Stem, key[:StemSize]) {
bitStem := bt.Stem[bt.depth/8] >> (7 - (bt.depth % 8)) & 1
n := &InternalNode{depth: bt.depth, mustRecompute: true}
n := &InternalNode{depth: bt.depth, mustRecompute: true, dirty: true}
bt.depth++
// bt is re-parented under n and sits at a new path — rewrite its blob.
bt.mustRecompute = true
bt.dirty = true
var child, other *BinaryNode
if bitStem == 0 {
n.left = bt
@ -77,6 +81,7 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int
Values: values[:],
depth: depth + 1,
mustRecompute: true,
dirty: true,
}
}
return n, nil
@ -86,6 +91,7 @@ func (bt *StemNode) Insert(key []byte, value []byte, _ NodeResolverFn, depth int
}
bt.Values[key[StemSize]] = value
bt.mustRecompute = true
bt.dirty = true
return bt, nil
}
@ -101,6 +107,7 @@ func (bt *StemNode) Copy() BinaryNode {
depth: bt.depth,
hash: bt.hash,
mustRecompute: bt.mustRecompute,
dirty: bt.dirty,
}
}
@ -151,10 +158,14 @@ func (bt *StemNode) Hash() common.Hash {
return bt.hash
}
// CollectNodes collects all child nodes at a given path, and flushes it
// into the provided node collector.
// CollectNodes flushes the stem via the collector when dirty; clean stems
// are skipped.
func (bt *StemNode) CollectNodes(path []byte, flush NodeFlushFn) error {
if !bt.dirty {
return nil
}
flush(path, bt)
bt.dirty = false
return nil
}
@ -172,8 +183,11 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv
if !bytes.Equal(bt.Stem, key[:StemSize]) {
bitStem := bt.Stem[bt.depth/8] >> (7 - (bt.depth % 8)) & 1
n := &InternalNode{depth: bt.depth, mustRecompute: true}
n := &InternalNode{depth: bt.depth, mustRecompute: true, dirty: true}
bt.depth++
// bt is re-parented under n and sits at a new path — rewrite its blob.
bt.mustRecompute = true
bt.dirty = true
var child, other *BinaryNode
if bitStem == 0 {
n.left = bt
@ -199,6 +213,7 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv
Values: values,
depth: n.depth + 1,
mustRecompute: true,
dirty: true,
}
}
return n, nil
@ -209,6 +224,7 @@ func (bt *StemNode) InsertValuesAtStem(key []byte, values [][]byte, _ NodeResolv
if v != nil {
bt.Values[i] = v
bt.mustRecompute = true
bt.dirty = true
}
}
return bt, nil

View file

@ -379,6 +379,7 @@ func TestStemNodeCollectNodes(t *testing.T) {
Stem: stem,
Values: values[:],
depth: 0,
dirty: true,
}
var collectedPaths [][]byte
@ -412,3 +413,44 @@ func TestStemNodeCollectNodes(t *testing.T) {
t.Errorf("Path mismatch: expected [0, 1, 0], got %v", collectedPaths[0])
}
}
// TestStemNodeCollectNodesSkipsClean verifies that a clean stem is not
// flushed, and that flushing a dirty stem clears its dirty flag so that
// a subsequent CollectNodes on the same node is a no-op.
func TestStemNodeCollectNodesSkipsClean(t *testing.T) {
stem := make([]byte, 31)
node := &StemNode{
Stem: stem,
Values: make([][]byte, 256),
depth: 0,
}
var collected []BinaryNode
flushFn := func(_ []byte, n BinaryNode) { collected = append(collected, n) }
if err := node.CollectNodes([]byte{0}, flushFn); err != nil {
t.Fatalf("CollectNodes on clean stem: %v", err)
}
if len(collected) != 0 {
t.Fatalf("expected clean stem not to be flushed, got %d", len(collected))
}
node.dirty = true
if err := node.CollectNodes([]byte{0}, flushFn); err != nil {
t.Fatalf("CollectNodes on dirty stem: %v", err)
}
if len(collected) != 1 {
t.Fatalf("expected dirty stem to be flushed once, got %d", len(collected))
}
if node.dirty {
t.Errorf("stem dirty flag should be cleared after flush")
}
collected = nil
if err := node.CollectNodes([]byte{0}, flushFn); err != nil {
t.Fatalf("CollectNodes after flush: %v", err)
}
if len(collected) != 0 {
t.Errorf("expected no flush on clean stem, got %d", len(collected))
}
}

View file

@ -779,3 +779,123 @@ func TestGetStorageNonMembershipInternalRoot(t *testing.T) {
t.Fatalf("expected nil/zero for non-existent storage, got %x", got)
}
}
// commitKeyN derives a distinct 32-byte key from a seed integer. Used by
// TestBinaryTrieCommitIncremental and BenchmarkCollectNodes_SparseWrite to
// populate a trie with many disjoint stems.
func commitKeyN(i int) [HashSize]byte {
var k [HashSize]byte
binary.BigEndian.PutUint64(k[:8], uint64(i)*0x9e3779b97f4a7c15)
binary.BigEndian.PutUint64(k[8:16], uint64(i)*0xc2b2ae3d27d4eb4f)
binary.BigEndian.PutUint64(k[16:24], uint64(i)*0x165667b19e3779f9)
binary.BigEndian.PutUint64(k[24:32], uint64(i)*0x85ebca77c2b2ae63)
return k
}
// TestBinaryTrieCommitIncremental verifies that a second Commit with only a
// single modified leaf flushes only the path from that leaf to the root,
// not the entire tree.
func TestBinaryTrieCommitIncremental(t *testing.T) {
tr := &BinaryTrie{
root: NewBinaryNode(),
tracer: trie.NewPrevalueTracer(),
}
const n = 512
keys := make([][HashSize]byte, n)
for i := range n {
keys[i] = commitKeyN(i + 1)
var v [HashSize]byte
binary.BigEndian.PutUint64(v[24:], uint64(i+1))
var err error
tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0)
if err != nil {
t.Fatalf("Insert %d: %v", i, err)
}
}
_, ns1 := tr.Commit(false)
if len(ns1.Nodes) == 0 {
t.Fatal("first Commit produced empty NodeSet")
}
if len(ns1.Nodes) < n {
t.Fatalf("first Commit: expected at least %d nodes, got %d", n, len(ns1.Nodes))
}
// Second Commit on the same trie with no modifications: NodeSet must
// be empty because every subtree is clean.
_, nsNoop := tr.Commit(false)
if len(nsNoop.Nodes) != 0 {
t.Fatalf("no-op Commit: expected empty NodeSet, got %d nodes", len(nsNoop.Nodes))
}
// Modify a single leaf's value. Only the path from that leaf to the
// root should appear in the next Commit's NodeSet.
var newVal [HashSize]byte
newVal[0] = 0xff
var err error
tr.root, err = tr.root.Insert(keys[n/2][:], newVal[:], nil, 0)
if err != nil {
t.Fatalf("Insert (modify): %v", err)
}
_, ns2 := tr.Commit(false)
// Path length for a binary trie of n=512 stems is bounded by the
// internal depth at which the modified stem sits. Allow generous
// slack: up to 64 nodes is fine, anywhere near n (512) is a regression.
if len(ns2.Nodes) == 0 {
t.Fatal("modified Commit produced empty NodeSet")
}
if len(ns2.Nodes) > 64 {
t.Fatalf("modified Commit: expected small NodeSet, got %d nodes (first Commit had %d)", len(ns2.Nodes), len(ns1.Nodes))
}
if len(ns2.Nodes) >= len(ns1.Nodes) {
t.Fatalf("expected second NodeSet (%d) to be smaller than first (%d)", len(ns2.Nodes), len(ns1.Nodes))
}
}
// BenchmarkCollectNodes_SparseWrite measures Commit cost when only one leaf
// changes between blocks — the common case for state updates. After warm-up
// (populate + initial Commit), each iteration modifies a single leaf and
// re-Commits. Under the skip-clean optimization, each iteration flushes
// only the root-to-leaf path; pre-fix behavior would re-flush the entire
// tree every iteration.
func BenchmarkCollectNodes_SparseWrite(b *testing.B) {
const n = 10_000
tr := &BinaryTrie{
root: NewBinaryNode(),
tracer: trie.NewPrevalueTracer(),
}
keys := make([][HashSize]byte, n)
for i := range n {
keys[i] = commitKeyN(i + 1)
var v [HashSize]byte
binary.BigEndian.PutUint64(v[24:], uint64(i+1))
var err error
tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0)
if err != nil {
b.Fatalf("Insert %d: %v", i, err)
}
}
// Flush the initial tree so subsequent Commits reflect the
// single-modification workload we want to measure.
_, _ = tr.Commit(false)
var newVal [HashSize]byte
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
idx := i % n
binary.BigEndian.PutUint64(newVal[24:], uint64(i+1))
var err error
tr.root, err = tr.root.Insert(keys[idx][:], newVal[:], nil, 0)
if err != nil {
b.Fatalf("Insert at iter %d: %v", i, err)
}
_, ns := tr.Commit(false)
if len(ns.Nodes) == 0 {
b.Fatalf("iter %d: empty NodeSet", i)
}
}
}