trie/bintrie: port dirty flag + CollectNodes skip-clean from master

Master added (via PR #34754) a dirty bool to InternalNode/StemNode plus a
CollectNodes short-circuit that skips clean subtrees — the arena branch
diverged before that landed. Port the semantics onto the arena shape:

- Add dirty bool to InternalNode and StemNode.
- Wire dirty=true alongside every existing mustRecompute=true setter in
  node_store.go (newInternalRef, newStemRef) and store_ops.go (8 mutation
  sites across InsertSingle/insertSingleInternal/InsertValuesAtStem/
  insertValuesAtStem/splitStemInsert/splitStemValuesInsert).
- Add 'if !node.dirty { return nil }' gate at the top of CollectNodes for
  both KindInternal and KindStem; clear dirty after flushfn runs.
- Plumb a dirty parameter through deserializeNode; DeserializeNode passes
  dirty=true (safe default), DeserializeNodeWithHash passes dirty=false
  (loaded from disk, blob matches).

The arena test in trie_test.go that was auto-merged from master used
master-shape struct literals (tr.root, NewBinaryNode) that don't exist on
arena; delete those and replace with TestCommitSkipCleanSubtrees, an
arena-native version that asserts first-Commit flushes all nodes, no-op
Commit flushes none, and single-leaf Commit flushes only the root-to-leaf
path.
This commit is contained in:
CPerezz 2026-04-18 18:45:12 +02:00
parent 1a37d82231
commit 939b36345f
No known key found for this signature in database
GPG key ID: 62045F34B97177DD
6 changed files with 63 additions and 95 deletions

View file

@ -34,9 +34,12 @@ func keyToPath(depth int, key []byte) ([]byte, error) {
return path, nil
}
// Invariant: dirty=false implies mustRecompute=false. Every mutation that
// invalidates the cached hash MUST also mark the blob for re-flush.
type InternalNode struct {
left, right NodeRef
depth uint8
mustRecompute bool
mustRecompute bool // hash is stale (cleared by Hash)
dirty bool // on-disk blob is stale (cleared by CollectNodes)
hash common.Hash
}

View file

@ -84,6 +84,7 @@ func (s *NodeStore) newInternalRef(depth int) NodeRef {
n := s.getInternal(idx)
n.depth = uint8(depth)
n.mustRecompute = true
n.dirty = true
return MakeRef(KindInternal, idx)
}
@ -119,6 +120,7 @@ func (s *NodeStore) newStemRef(stem []byte, depth int) NodeRef {
copy(sn.Stem[:], stem[:StemSize])
sn.depth = uint8(depth)
sn.mustRecompute = true
sn.dirty = true
return MakeRef(KindStem, idx)
}

View file

@ -23,6 +23,9 @@ import (
)
// StemNode holds up to 256 values sharing a 31-byte stem, packed via bitmap.
//
// Invariant: dirty=false implies mustRecompute=false. Every mutation that
// invalidates the cached hash MUST also mark the blob for re-flush.
type StemNode struct {
Stem [StemSize]byte
bitmap [StemBitmapSize]byte
@ -31,7 +34,8 @@ type StemNode struct {
depth uint8
shared bool // true if valueData is shared with serialized input
mustRecompute bool // true if the hash needs to be recomputed
mustRecompute bool // hash is stale (cleared by Hash)
dirty bool // on-disk blob is stale (cleared by CollectNodes)
hash common.Hash // cached hash when mustRecompute == false
}

View file

@ -124,17 +124,19 @@ func (s *NodeStore) SerializeNode(ref NodeRef) []byte {
var errInvalidSerializedLength = errors.New("invalid serialized node length")
// DeserializeNode deserializes a node from bytes, recomputing its hash.
// DeserializeNode deserializes a node from bytes, recomputing its hash. The
// returned node is marked dirty (provenance unknown, safe re-flush default).
func (s *NodeStore) DeserializeNode(serialized []byte, depth int) (NodeRef, error) {
return s.deserializeNode(serialized, depth, common.Hash{}, true)
return s.deserializeNode(serialized, depth, common.Hash{}, true, true)
}
// DeserializeNodeWithHash deserializes a node, using the provided hash.
// DeserializeNodeWithHash deserializes a node whose hash is already known and
// whose blob is already on disk (mustRecompute=false, dirty=false).
func (s *NodeStore) DeserializeNodeWithHash(serialized []byte, depth int, hn common.Hash) (NodeRef, error) {
return s.deserializeNode(serialized, depth, hn, false)
return s.deserializeNode(serialized, depth, hn, false, false)
}
func (s *NodeStore) deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute bool) (NodeRef, error) {
func (s *NodeStore) deserializeNode(serialized []byte, depth int, hn common.Hash, mustRecompute, dirty bool) (NodeRef, error) {
if len(serialized) == 0 {
return EmptyRef, nil
}
@ -164,6 +166,7 @@ func (s *NodeStore) deserializeNode(serialized []byte, depth int, hn common.Hash
node.hash = hn
node.mustRecompute = false
}
node.dirty = dirty
return ref, nil
case nodeTypeStem:
@ -191,6 +194,7 @@ func (s *NodeStore) deserializeNode(serialized []byte, depth int, hn common.Hash
sn.depth = uint8(depth)
sn.hash = hn
sn.mustRecompute = mustRecompute
sn.dirty = dirty
return MakeRef(KindStem, stemIdx), nil
default:
@ -198,13 +202,18 @@ func (s *NodeStore) deserializeNode(serialized []byte, depth int, hn common.Hash
}
}
// CollectNodes flushes every node via flushfn in post-order.
// CollectNodes flushes every node that needs flushing via flushfn in post-order.
// Invariant: any ancestor of a node that needs flushing is itself marked, so a
// clean root means the whole subtree is clean.
func (s *NodeStore) CollectNodes(ref NodeRef, path []byte, flushfn NodeFlushFn) error {
switch ref.Kind() {
case KindEmpty:
return nil
case KindInternal:
node := s.getInternal(ref.Index())
if !node.dirty {
return nil
}
leftPath := make([]byte, len(path)+1)
copy(leftPath, path)
leftPath[len(path)] = 0
@ -218,9 +227,15 @@ func (s *NodeStore) CollectNodes(ref NodeRef, path []byte, flushfn NodeFlushFn)
return err
}
flushfn(path, s.ComputeHash(ref), s.SerializeNode(ref))
node.dirty = false
return nil
case KindStem:
sn := s.getStem(ref.Index())
if !sn.dirty {
return nil
}
flushfn(path, s.ComputeHash(ref), s.SerializeNode(ref))
sn.dirty = false
return nil
case KindHashed:
return nil // Already committed

View file

@ -193,6 +193,7 @@ func (s *NodeStore) InsertSingle(stem []byte, suffix byte, value []byte, resolve
if sn.Stem == [StemSize]byte(stem[:StemSize]) {
sn.setValue(suffix, value)
sn.mustRecompute = true
sn.dirty = true
return nil
}
newRoot := s.splitStemInsert(s.root, stem, suffix, value, int(sn.depth))
@ -218,6 +219,7 @@ func (s *NodeStore) insertSingleInternal(stem []byte, suffix byte, value []byte,
case KindInternal:
node := s.getInternal(cur.Index())
node.mustRecompute = true
node.dirty = true
bit := stem[node.depth/8] >> (7 - (node.depth % 8)) & 1
pathStack[pathLen] = pathEntry{internalIdx: cur.Index(), isLeft: bit == 0}
pathLen++
@ -232,6 +234,7 @@ func (s *NodeStore) insertSingleInternal(stem []byte, suffix byte, value []byte,
if sn.Stem == [StemSize]byte(stem[:StemSize]) {
sn.setValue(suffix, value)
sn.mustRecompute = true
sn.dirty = true
return nil
}
// Different stem — split
@ -338,6 +341,7 @@ func (s *NodeStore) splitStemInsert(existingRef NodeRef, newStem []byte, suffix
copy(newSn.Stem[:], newStem[:StemSize])
newSn.depth = uint8(existingDepth + 1)
newSn.mustRecompute = true
newSn.dirty = true
newSn.setValue(suffix, value)
newStemRef := MakeRef(KindStem, newStemIdx)
@ -426,6 +430,7 @@ func (s *NodeStore) insertValuesAtStem(ref NodeRef, stem []byte, values [][]byte
node.right = newChild
}
node.mustRecompute = true
node.dirty = true
return ref, nil
case KindStem:
@ -436,6 +441,7 @@ func (s *NodeStore) insertValuesAtStem(ref NodeRef, stem []byte, values [][]byte
if v != nil {
sn.setValue(byte(i), v)
sn.mustRecompute = true
sn.dirty = true
}
}
return ref, nil
@ -470,6 +476,7 @@ func (s *NodeStore) insertValuesAtStem(ref NodeRef, stem []byte, values [][]byte
copy(sn.Stem[:], stem[:StemSize])
sn.depth = uint8(depth)
sn.mustRecompute = true
sn.dirty = true
for i, v := range values {
if v != nil {
sn.count++
@ -526,6 +533,7 @@ func (s *NodeStore) splitStemValuesInsert(existingRef NodeRef, newStem []byte, v
copy(newSn.Stem[:], newStem[:StemSize])
newSn.depth = nNode.depth + 1
newSn.mustRecompute = true
newSn.dirty = true
for i, v := range values {
if v != nil {
newSn.setValue(byte(i), v)

View file

@ -762,36 +762,29 @@ func TestGetStorageNonMembershipInternalRoot(t *testing.T) {
}
}
// commitKeyN derives a distinct 32-byte key from a seed integer. Used by
// TestBinaryTrieCommitIncremental and BenchmarkCollectNodes_SparseWrite to
// populate a trie with many disjoint stems.
func commitKeyN(i int) [HashSize]byte {
var k [HashSize]byte
binary.BigEndian.PutUint64(k[:8], uint64(i)*0x9e3779b97f4a7c15)
binary.BigEndian.PutUint64(k[8:16], uint64(i)*0xc2b2ae3d27d4eb4f)
binary.BigEndian.PutUint64(k[16:24], uint64(i)*0x165667b19e3779f9)
binary.BigEndian.PutUint64(k[24:32], uint64(i)*0x85ebca77c2b2ae63)
return k
}
// TestBinaryTrieCommitIncremental verifies that a second Commit with only a
// single modified leaf flushes only the path from that leaf to the root,
// not the entire tree.
func TestBinaryTrieCommitIncremental(t *testing.T) {
// TestCommitSkipCleanSubtrees verifies that CollectNodes short-circuits on
// clean subtrees. First Commit flushes every resolved node; a follow-up
// Commit with no modifications flushes nothing; a single-leaf modification
// flushes only the root-to-leaf path.
func TestCommitSkipCleanSubtrees(t *testing.T) {
tr := &BinaryTrie{
root: NewBinaryNode(),
store: NewNodeStore(),
tracer: trie.NewPrevalueTracer(),
}
const n = 512
keys := make([][HashSize]byte, n)
const n = 200
key := func(i int) [HashSize]byte {
var k [HashSize]byte
binary.BigEndian.PutUint64(k[:8], uint64(i+1)*0x9e3779b97f4a7c15)
binary.BigEndian.PutUint64(k[8:16], uint64(i+1)*0xc2b2ae3d27d4eb4f)
binary.BigEndian.PutUint64(k[16:24], uint64(i+1)*0x165667b19e3779f9)
binary.BigEndian.PutUint64(k[24:32], uint64(i+1)*0x85ebca77c2b2ae63)
return k
}
for i := range n {
keys[i] = commitKeyN(i + 1)
k := key(i)
var v [HashSize]byte
binary.BigEndian.PutUint64(v[24:], uint64(i+1))
var err error
tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0)
if err != nil {
if err := tr.store.Insert(k[:], v[:], nil); err != nil {
t.Fatalf("Insert %d: %v", i, err)
}
}
@ -800,84 +793,27 @@ func TestBinaryTrieCommitIncremental(t *testing.T) {
if len(ns1.Nodes) == 0 {
t.Fatal("first Commit produced empty NodeSet")
}
if len(ns1.Nodes) < n {
t.Fatalf("first Commit: expected at least %d nodes, got %d", n, len(ns1.Nodes))
}
// Second Commit on the same trie with no modifications: NodeSet must
// be empty because every subtree is clean.
_, nsNoop := tr.Commit(false)
if len(nsNoop.Nodes) != 0 {
t.Fatalf("no-op Commit: expected empty NodeSet, got %d nodes", len(nsNoop.Nodes))
t.Fatalf("no-op Commit: expected empty NodeSet, got %d", len(nsNoop.Nodes))
}
// Modify a single leaf's value. Only the path from that leaf to the
// root should appear in the next Commit's NodeSet.
// Modify a single leaf — only the root-to-leaf path should flush.
k := key(n / 2)
var newVal [HashSize]byte
newVal[0] = 0xff
var err error
tr.root, err = tr.root.Insert(keys[n/2][:], newVal[:], nil, 0)
if err != nil {
if err := tr.store.Insert(k[:], newVal[:], nil); err != nil {
t.Fatalf("Insert (modify): %v", err)
}
_, ns2 := tr.Commit(false)
// Path length for a binary trie of n=512 stems is bounded by the
// internal depth at which the modified stem sits. Allow generous
// slack: up to 64 nodes is fine, anywhere near n (512) is a regression.
if len(ns2.Nodes) == 0 {
t.Fatal("modified Commit produced empty NodeSet")
}
if len(ns2.Nodes) > 64 {
t.Fatalf("modified Commit: expected small NodeSet, got %d nodes (first Commit had %d)", len(ns2.Nodes), len(ns1.Nodes))
if len(ns2.Nodes) > 32 {
t.Fatalf("modified Commit: expected ≤32 nodes (path+stem), got %d", len(ns2.Nodes))
}
if len(ns2.Nodes) >= len(ns1.Nodes) {
t.Fatalf("expected second NodeSet (%d) to be smaller than first (%d)", len(ns2.Nodes), len(ns1.Nodes))
}
}
// BenchmarkCollectNodes_SparseWrite measures Commit cost when only one leaf
// changes between blocks — the common case for state updates. After warm-up
// (populate + initial Commit), each iteration modifies a single leaf and
// re-Commits. Under the skip-clean optimization, each iteration flushes
// only the root-to-leaf path; pre-fix behavior would re-flush the entire
// tree every iteration.
func BenchmarkCollectNodes_SparseWrite(b *testing.B) {
const n = 10_000
tr := &BinaryTrie{
root: NewBinaryNode(),
tracer: trie.NewPrevalueTracer(),
}
keys := make([][HashSize]byte, n)
for i := range n {
keys[i] = commitKeyN(i + 1)
var v [HashSize]byte
binary.BigEndian.PutUint64(v[24:], uint64(i+1))
var err error
tr.root, err = tr.root.Insert(keys[i][:], v[:], nil, 0)
if err != nil {
b.Fatalf("Insert %d: %v", i, err)
}
}
// Flush the initial tree so subsequent Commits reflect the
// single-modification workload we want to measure.
_, _ = tr.Commit(false)
var newVal [HashSize]byte
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
idx := i % n
binary.BigEndian.PutUint64(newVal[24:], uint64(i+1))
var err error
tr.root, err = tr.root.Insert(keys[idx][:], newVal[:], nil, 0)
if err != nil {
b.Fatalf("Insert at iter %d: %v", i, err)
}
_, ns := tr.Commit(false)
if len(ns.Nodes) == 0 {
b.Fatalf("iter %d: empty NodeSet", i)
}
}
}