nomt: optimize Hash() pipeline — pool hashers, eliminate redundant sorts, in-place merge

Performance optimizations to the NOMT storage engine while preserving correctness (all triecompare cross-validation tests pass at 10K+ scale):

- Pool SHA256 hashers via sync.Pool in HashInternal and HashStem
- Replace allStems map with sorted slice + O(N+M) merge (in-place fast path for incremental updates that touch only existing stems avoids allocation entirely)
- Add UpdateSorted to db.DB, skipping redundant sort of pre-sorted ops
- Simplify canonicalRoot to use pre-sorted allStems directly
- Optimize StemSharedBits with byte-level XOR + bits.LeadingZeros8
- Replace stemLess loops with bytes.Compare in all locations
- Eliminate per-stem map alloc in groupAndHashStems (use [256]bool dirty)
- Use stack-allocated [248]bool for downBits in BuildInternalTree
- Remove unused stemPathCmp function

BenchmarkHash/10000/nomt: 9.8ms → 8.2ms (-16%)
BenchmarkBlockWorkload/nomt: 7.7ms → 6.6ms (-14%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-19 21:31:37 +00:00 · 2026-02-13 01:11:59 +08:00 · 2026-02-13 01:11:59 +08:00 · 036e37809e
commit 036e37809e
parent d61dd875d8
6 changed files with 164 additions and 98 deletions
--- a/nomt/core/hasher.go
+++ b/nomt/core/hasher.go
@ -1,6 +1,10 @@
 package core

-import "crypto/sha256"
+import (
+	"crypto/sha256"
+	"hash"
+	"sync"
+)

 const (
 	// StemSize is the number of bytes in a stem path (248 bits).
@ -13,13 +17,19 @@ const (
 	HashSize = 32
 )

+var sha256Pool = sync.Pool{
+	New: func() any { return sha256.New() },
+}
+
 // HashInternal computes SHA256(left || right) matching EIP-7864's InternalNode.Hash().
 func HashInternal(data *InternalData) Node {
-	h := sha256.New()
+	h := sha256Pool.Get().(hash.Hash)
+	h.Reset()
 	h.Write(data.Left[:])
 	h.Write(data.Right[:])
 	var out Node
 	h.Sum(out[:0])
+	sha256Pool.Put(h)
 	return out
 }

@ -38,7 +48,7 @@ func HashStem(stem [StemSize]byte, values [StemNodeWidth][]byte) Node {
 		}
 	}

-	h := sha256.New()
+	h := sha256Pool.Get().(hash.Hash)
 	for level := 1; level <= 8; level++ {
 		for i := range StemNodeWidth / (1 << level) {
 			if data[i*2] == (Node{}) && data[i*2+1] == (Node{}) {
@ -58,5 +68,6 @@ func HashStem(stem [StemSize]byte, values [StemNodeWidth][]byte) Node {
 	h.Write(data[0][:])
 	var out Node
 	h.Sum(out[:0])
+	sha256Pool.Put(h)
 	return out
 }
--- a/nomt/core/update.go
+++ b/nomt/core/update.go
@ -1,5 +1,7 @@
 package core

+import "math/bits"
+
 // StemKeyValue is a resolved (stemPath, stemHash) pair for the page tree.
 // The stem hash is precomputed by the integration layer using HashStem.
 type StemKeyValue struct {
@ -32,17 +34,26 @@ type WriteNode struct {
 // StemSharedBits counts the number of shared prefix bits between two stem
 // paths, starting after `skip` bits.
 func StemSharedBits(a, b *StemPath, skip int) int {
-	count := 0
-	maxBits := StemSize * 8 // 248
-	for i := skip; i < maxBits; i++ {
-		aBit := (a[i/8] >> (7 - i%8)) & 1
-		bBit := (b[i/8] >> (7 - i%8)) & 1
-		if aBit != bBit {
-			break
+	startByte := skip / 8
+
+	// Handle partial first byte if skip is not byte-aligned.
+	if skip%8 != 0 {
+		mask := byte(0xFF >> (skip % 8))
+		xor := (a[startByte] ^ b[startByte]) & mask
+		if xor != 0 {
+			return bits.LeadingZeros8(xor) - (skip % 8)
 		}
-		count++
+		startByte++
 	}
-	return count
+
+	// Compare full bytes.
+	for i := startByte; i < StemSize; i++ {
+		xor := a[i] ^ b[i]
+		if xor != 0 {
+			return i*8 + bits.LeadingZeros8(xor) - skip
+		}
+	}
+	return StemSize*8 - skip
 }

 // BuildInternalTree builds a compact internal-node sub-trie from sorted
@ -132,9 +143,10 @@ func BuildInternalTree(skip int, ops []StemKeyValue, visit func(WriteNode)) Node
 		}
 		stemEndBit := skip + stemDepth

+		var downBuf [StemSize * 8]bool
 		var downBits []bool
 		if stemEndBit > downStart {
-			downBits = make([]bool, stemEndBit-downStart)
+			downBits = downBuf[:stemEndBit-downStart]
 			for i := downStart; i < stemEndBit; i++ {
 				downBits[i-downStart] = stemBitAt(thisStem, i)
 			}
@ -191,15 +203,3 @@ func BuildInternalTree(skip int, ops []StemKeyValue, visit func(WriteNode)) Node
 func stemBitAt(stem *StemPath, idx int) bool {
 	return (stem[idx/8]>>(7-idx%8))&1 == 1
 }
-
-func stemPathCmp(a, b *StemPath) int {
-	for i := range a {
-		if a[i] < b[i] {
-			return -1
-		}
-		if a[i] > b[i] {
-			return 1
-		}
-	}
-	return 0
-}
--- a/nomt/db/db.go
+++ b/nomt/db/db.go
@ -6,6 +6,7 @@
 package db

 import (
+	"bytes"
 	"crypto/rand"
 	"fmt"
 	"os"
@ -127,14 +128,18 @@ func (db *DB) SyncSeqn() uint32 {
 	return db.syncSeqn
 }

-// Update applies a sorted batch of stem key-value pairs to the trie.
-//
-// The pairs must be pre-sorted by stem path. The function:
-//  1. Builds a PageSet from Bitbox
-//  2. Runs the parallel PageWalker to produce updated pages
-//  3. Persists updated pages via Bitbox sync
-//  4. Returns the new root hash
+// Update applies a batch of stem key-value pairs to the trie.
+// The pairs are sorted internally before processing.
 func (db *DB) Update(ops []core.StemKeyValue) (core.Node, error) {
+	sort.Slice(ops, func(i, j int) bool {
+		return stemLess(&ops[i].Stem, &ops[j].Stem)
+	})
+	return db.UpdateSorted(ops)
+}
+
+// UpdateSorted applies a pre-sorted batch of stem key-value pairs to the trie.
+// The caller must ensure ops are sorted by stem path.
+func (db *DB) UpdateSorted(ops []core.StemKeyValue) (core.Node, error) {
 	if len(ops) == 0 {
 		return db.Root(), nil
 	}
@ -142,11 +147,6 @@ func (db *DB) Update(ops []core.StemKeyValue) (core.Node, error) {
 	db.mu.Lock()
 	defer db.mu.Unlock()

-	// Sort by stem path.
-	sort.Slice(ops, func(i, j int) bool {
-		return stemLess(&ops[i].Stem, &ops[j].Stem)
-	})
-
 	pageSetFactory := func() merkle.PageSet {
 		return newBitboxPageSet(db.bb)
 	}
@ -249,13 +249,5 @@ func pageIDKey(id core.PageID) string {
 }

 func stemLess(a, b *core.StemPath) bool {
-	for i := range a {
-		if a[i] < b[i] {
-			return true
-		}
-		if a[i] > b[i] {
-			return false
-		}
-	}
-	return false
+	return bytes.Compare(a[:], b[:]) < 0
 }
--- a/trie/nomttrie/stem.go
+++ b/trie/nomttrie/stem.go
@ -1,6 +1,7 @@
 package nomttrie

 import (
+	"bytes"
 	"sort"

 	"github.com/ethereum/go-ethereum/ethdb"
@ -54,16 +55,19 @@ func loadStemValues(diskdb ethdb.Database, stem core.StemPath) ([core.StemNodeWi
 }

 // writeStemValues writes updated stem values to an ethdb batch.
-// Nil values delete the key; non-nil values overwrite.
-func writeStemValues(batch ethdb.Batch, stem core.StemPath, updates map[byte][]byte) error {
-	for suffix, value := range updates {
-		key := stemValueDBKey(stem, suffix)
-		if value == nil {
+// Only slots marked dirty are written. Nil values delete the key.
+func writeStemValues(batch ethdb.Batch, stem core.StemPath, values [core.StemNodeWidth][]byte, dirty [core.StemNodeWidth]bool) error {
+	for i, d := range dirty {
+		if !d {
+			continue
+		}
+		key := stemValueDBKey(stem, byte(i))
+		if values[i] == nil {
 			if err := batch.Delete(key); err != nil {
 				return err
 			}
 		} else {
-			if err := batch.Put(key, value); err != nil {
+			if err := batch.Put(key, values[i]); err != nil {
 				return err
 			}
 		}
@ -110,16 +114,16 @@ func groupAndHashStems(
 		}

 		// Apply updates.
-		flatUpdates := make(map[byte][]byte, 4)
+		var dirty [core.StemNodeWidth]bool
 		for idx < len(updates) && updates[idx].Stem == stem {
 			u := updates[idx]
 			values[u.Suffix] = u.Value
-			flatUpdates[u.Suffix] = u.Value
+			dirty[u.Suffix] = true
 			idx++
 		}

 		// Write to flat state.
-		if err := writeStemValues(batch, stem, flatUpdates); err != nil {
+		if err := writeStemValues(batch, stem, values, dirty); err != nil {
 			return nil, err
 		}

@ -147,13 +151,5 @@ func groupAndHashStems(

 // stemLess compares two stem paths lexicographically.
 func stemLess(a, b *core.StemPath) bool {
-	for i := range a {
-		if a[i] < b[i] {
-			return true
-		}
-		if a[i] > b[i] {
-			return false
-		}
-	}
-	return false
+	return bytes.Compare(a[:], b[:]) < 0
 }
--- a/trie/nomttrie/stem_test.go
+++ b/trie/nomttrie/stem_test.go
@ -59,13 +59,17 @@ func TestWriteStemValues(t *testing.T) {
 	var stem core.StemPath
 	stem[0] = 0xCC

-	// Write a value.
+	// Write a value at slot 3.
 	val := make([]byte, 32)
 	val[0] = 0x42

+	var values [core.StemNodeWidth][]byte
+	var dirty [core.StemNodeWidth]bool
+	values[3] = val
+	dirty[3] = true
+
 	batch := diskdb.NewBatch()
-	updates := map[byte][]byte{3: val}
-	require.NoError(t, writeStemValues(batch, stem, updates))
+	require.NoError(t, writeStemValues(batch, stem, values, dirty))
 	require.NoError(t, batch.Write())

 	// Verify it was written.
@ -74,9 +78,11 @@ func TestWriteStemValues(t *testing.T) {
 	assert.Equal(t, val, data)

 	// Delete it.
+	values[3] = nil
+	dirty[3] = true
+
 	batch = diskdb.NewBatch()
-	deletes := map[byte][]byte{3: nil}
-	require.NoError(t, writeStemValues(batch, stem, deletes))
+	require.NoError(t, writeStemValues(batch, stem, values, dirty))
 	require.NoError(t, batch.Write())

 	has, err := diskdb.Has(stemValueDBKey(stem, 3))
--- a/trie/nomttrie/trie.go
+++ b/trie/nomttrie/trie.go
@ -7,8 +7,8 @@
 package nomttrie

 import (
+	"bytes"
 	"encoding/binary"
-	"sort"

 	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/core/types"
@ -47,20 +47,23 @@ type NomtTrie struct {
 	pending []stemUpdate     // accumulated stem updates
 	dirty   bool             // whether pending updates exist

-	// allStems tracks the stem hash for every active stem in the trie.
-	// Updated on each Hash() with results from groupAndHashStems.
-	// Used to compute the canonical root via BuildInternalTree(skip=0).
-	allStems map[core.StemPath]core.Node
+	// allStems tracks the stem hash for every active stem in the trie,
+	// kept sorted by stem path. Updated on each Hash() via sorted merge
+	// with results from groupAndHashStems.
+	allStems []core.StemKeyValue
+
+	// mergeBuf is reused across Hash() calls to avoid allocating a new
+	// slice on every merge. After merge, allStems and mergeBuf swap roles.
+	mergeBuf []core.StemKeyValue
 }

 // New creates a new NomtTrie. The root parameter is the current state root.
 func New(root common.Hash, backend *nomtdb.Database) (*NomtTrie, error) {
 	return &NomtTrie{
-		nomtDB:   backend.NomtDB(),
-		backend:  backend,
-		root:     root,
-		pending:  make([]stemUpdate, 0, 64),
-		allStems: make(map[core.StemPath]core.Node, 64),
+		nomtDB:  backend.NomtDB(),
+		backend: backend,
+		root:    root,
+		pending: make([]stemUpdate, 0, 64),
 	}, nil
 }

@ -234,22 +237,23 @@ func (t *NomtTrie) Hash() common.Hash {
 		return t.root
 	}

-	// Update allStems with new/changed stem hashes.
-	for _, kv := range stemKVs {
-		t.allStems[kv.Stem] = kv.Hash
-	}
+	// Merge sorted stemKVs into allStems (both are sorted by stem path).
+	// Swap allStems and mergeBuf to reuse backing arrays across calls.
+	merged := mergeStemKVs(t.allStems, stemKVs, t.mergeBuf)
+	t.mergeBuf = t.allStems
+	t.allStems = merged

 	// Update the page tree for persistent storage.
+	// stemKVs is already sorted, so skip the redundant sort in db.Update.
 	if len(stemKVs) > 0 {
-		if _, err := t.nomtDB.Update(stemKVs); err != nil {
+		if _, err := t.nomtDB.UpdateSorted(stemKVs); err != nil {
 			log.Error("NOMT page tree update failed", "err", err)
 			return t.root
 		}
 	}

 	// Compute the canonical root via BuildInternalTree(skip=0).
-	// This produces roots identical to bintrie by avoiding the depth-7
-	// worker split that adds extra wrapping levels.
+	// allStems is already sorted, so no additional sort needed.
 	t.root = common.Hash(t.canonicalRoot())

 	t.pending = t.pending[:0]
@ -258,19 +262,78 @@ func (t *NomtTrie) Hash() common.Hash {
 }

 // canonicalRoot computes the bintrie-compatible root hash from all known stems
-// using BuildInternalTree at skip=0.
+// using BuildInternalTree at skip=0. allStems is already sorted.
 func (t *NomtTrie) canonicalRoot() core.Node {
 	if len(t.allStems) == 0 {
 		return core.Terminator
 	}
-	sorted := make([]core.StemKeyValue, 0, len(t.allStems))
-	for stem, hash := range t.allStems {
-		sorted = append(sorted, core.StemKeyValue{Stem: stem, Hash: hash})
+	return core.BuildInternalTree(0, t.allStems, func(_ core.WriteNode) {})
+}
+
+// mergeStemKVs merges sorted new stemKVs into sorted existing allStems.
+// Existing entries with the same stem are replaced. The result is sorted.
+// The buf parameter is reused for the result to avoid allocation when new
+// stems need to be inserted.
+func mergeStemKVs(existing, updates, buf []core.StemKeyValue) []core.StemKeyValue {
+	if len(updates) == 0 {
+		return existing
 	}
-	sort.Slice(sorted, func(i, j int) bool {
-		return stemLess(&sorted[i].Stem, &sorted[j].Stem)
-	})
-	return core.BuildInternalTree(0, sorted, func(_ core.WriteNode) {})
+	if len(existing) == 0 {
+		return updates
+	}
+
+	// Fast path: check if all updates are in-place replacements (no new stems).
+	// This is the common case for incremental block updates where accounts
+	// already exist in the trie.
+	allInPlace := true
+	ei := 0
+	for _, u := range updates {
+		for ei < len(existing) && bytes.Compare(existing[ei].Stem[:], u.Stem[:]) < 0 {
+			ei++
+		}
+		if ei >= len(existing) || existing[ei].Stem != u.Stem {
+			allInPlace = false
+			break
+		}
+	}
+
+	if allInPlace {
+		// Update hashes in place — zero allocation.
+		ei = 0
+		for _, u := range updates {
+			for existing[ei].Stem != u.Stem {
+				ei++
+			}
+			existing[ei].Hash = u.Hash
+		}
+		return existing
+	}
+
+	// Slow path: some new stems need inserting. Use merge with buffer.
+	needed := len(existing) + len(updates)
+	if cap(buf) < needed {
+		buf = make([]core.StemKeyValue, 0, needed)
+	}
+	result := buf[:0]
+	i, j := 0, 0
+	for i < len(existing) && j < len(updates) {
+		cmp := bytes.Compare(existing[i].Stem[:], updates[j].Stem[:])
+		switch {
+		case cmp < 0:
+			result = append(result, existing[i])
+			i++
+		case cmp > 0:
+			result = append(result, updates[j])
+			j++
+		default:
+			result = append(result, updates[j])
+			i++
+			j++
+		}
+	}
+	result = append(result, existing[i:]...)
+	result = append(result, updates[j:]...)
+	return result
 }

 // Commit flushes pending operations and returns the root hash.
@ -304,10 +367,8 @@ func (t *NomtTrie) IsVerkle() bool {
 func (t *NomtTrie) Copy() *NomtTrie {
 	pending := make([]stemUpdate, len(t.pending))
 	copy(pending, t.pending)
-	allStems := make(map[core.StemPath]core.Node, len(t.allStems))
-	for k, v := range t.allStems {
-		allStems[k] = v
-	}
+	allStems := make([]core.StemKeyValue, len(t.allStems))
+	copy(allStems, t.allStems)
 	return &NomtTrie{
 		nomtDB:   t.nomtDB,
 		backend:  t.backend,