From a145150c39a2e6e4e87385011f347cf490d1aa03 Mon Sep 17 00:00:00 2001 From: weiihann Date: Mon, 9 Mar 2026 21:19:27 +0800 Subject: [PATCH] use pebble instead of custom db --- nomt/DESIGN.md | 1032 +++++++++++++++++------------- nomt/bitbox/bitbox_test.go | 399 ------------ nomt/bitbox/db.go | 284 -------- nomt/bitbox/htfile.go | 127 ---- nomt/bitbox/metamap.go | 118 ---- nomt/bitbox/probe.go | 81 --- nomt/bitbox/recover.go | 73 --- nomt/bitbox/sync.go | 131 ---- nomt/bitbox/wal.go | 296 --------- nomt/bitbox/wal_test.go | 217 ------- nomt/db/db.go | 195 +++--- nomt/db/db_test.go | 84 +-- nomt/merkle/worker_test.go | 2 +- trie/nomttrie/compat_test.go | 7 +- trie/nomttrie/trie_test.go | 8 +- trie/triecompare/compare_test.go | 82 +-- triedb/nomtdb/config.go | 14 +- triedb/nomtdb/database.go | 19 +- 18 files changed, 726 insertions(+), 2443 deletions(-) delete mode 100644 nomt/bitbox/bitbox_test.go delete mode 100644 nomt/bitbox/db.go delete mode 100644 nomt/bitbox/htfile.go delete mode 100644 nomt/bitbox/metamap.go delete mode 100644 nomt/bitbox/probe.go delete mode 100644 nomt/bitbox/recover.go delete mode 100644 nomt/bitbox/sync.go delete mode 100644 nomt/bitbox/wal.go delete mode 100644 nomt/bitbox/wal_test.go diff --git a/nomt/DESIGN.md b/nomt/DESIGN.md index c9307a289a..e05abb7a11 100644 --- a/nomt/DESIGN.md +++ b/nomt/DESIGN.md @@ -1,527 +1,665 @@ -# Design Document: NOMT-Style Binary Merkle Tree for Geth +# NOMT Binary Merkle Trie on PebbleDB — Design Document -## Context +## 1. Overview -Geth's Merkle Patricia Trie (MPT) is I/O-bound during block execution and state commitment. NOMT (Nearly-Optimal Merkle Trie) addresses this by using a binary merkle trie with SSD-optimized page-based storage, achieving significantly higher throughput through batched updates, aggressive prefetching, and minimal disk reads per operation. 
+NOMT (Nearly-Optimal Merkle Trie) is a page-based binary merkle trie engine +integrated into geth as an alternative to the Merkle Patricia Trie (MPT). It +stores trie nodes in fixed-size 4KB pages optimized for SSD I/O, and uses +parallel batch updates for high throughput during block execution. -This document describes how to port NOMT's architecture into geth as a **new, independent binary merkle trie implementation** alongside the existing MPT. The existing MPT code (`trie/`, `triedb/hashdb/`, `triedb/pathdb/`) remains untouched — NOMT is added as a new `triedb` backend option. NOMT has two core components: -- **Beatree** (B-tree for flat key-value storage) — replaced by PebbleDB -- **Bitbox** (on-disk hash table storing merkle tree pages) — the focus of this implementation +This implementation stores all data — trie pages, flat account/storage state, +and stem values — in geth's existing PebbleDB instance under dedicated key +prefixes. There is no custom storage engine; PebbleDB's LSM-tree, atomic +batches, and bloom filters handle persistence and crash safety. 
+ +``` ++-----------------------------------------------------------------------+ +| Ethereum State Layer | +| (StateDB: accounts, storage slots, contract code) | ++----------------------------------+------------------------------------+ + | + +----------v-----------+ + | NomtTrie | + | (state.Trie impl) | + | trie/nomttrie/ | + +----------+-----------+ + | + +--------------------+--------------------+ + | | + +----------v-----------+ +-------------v-----------+ + | Canonical Root | | Page Tree Engine | + | BuildInternalTree | | nomt/merkle/ | + | (248-bit tree over | | (PageWalker + workers) | + | all stem hashes) | +-------------+-----------+ + +----------------------+ | + +-------------v-----------+ + | nomt/db/ | + | (PebbleDB page store) | + +-------------+-----------+ + | + +--------------------+--------------------+ + | | | + +----------v------+ +---------v--------+ +--------v---------+ + | Flat State | | Stem Values | | Trie Pages | + | prefix 0x01-0x02 | | prefix 0x03 | | prefix 0x04 | + | (accts, storage) | | (per-slot values)| | (4KB RawPage) | + +------------------+ +------------------+ +------------------+ + | | | + +--------------------+--------------------+ + | + +----------v-----------+ + | PebbleDB | + | (single instance) | + +----------------------+ +``` + +### Design Principles + +1. **Single database**: All NOMT data lives in geth's PebbleDB — no custom + hash table, no WAL, no separate files. +2. **Page-level granularity**: The merkle engine operates on 4KB page blobs, + not individual 32-byte nodes. PebbleDB stores each page as a single KV pair. +3. **EIP-7864 compatibility**: Key derivation, stem hashing, and root + computation produce roots identical to geth's `trie/bintrie/`. +4. **Unchanged merkle engine**: The `nomt/merkle/` package (PageWalker, + parallel workers) has zero dependency on the storage backend — it accesses + pages through the `PageSet` interface. --- -## 1. NOMT Architecture Overview +## 2. 
PebbleDB Key Schema -### 1.1 Binary Merkle Trie - -NOMT uses a sparse binary merkle trie where all lookup paths are 256 bits and all nodes are exactly 32 bytes. Three node types exist: - -| Type | Value | MSB | Children | -|------|-------|-----|----------| -| **Internal** | `hash(left \|\| right)` | `0` | Two child nodes | -| **Leaf** | `hash(key_path \|\| value_hash)` | `1` | None | -| **Terminator** | `0x00...00` (all zeros) | N/A | None (empty subtrie) | - -The MSB (most significant bit) labeling enables O(1) node type discrimination. All node preimages are 512 bits (64 bytes). - -**Key insight**: Because every node is exactly 32 bytes, groups of nodes can be packed into fixed-size pages with predictable layouts — no variable-length encoding needed. - -### 1.2 Page Structure - -Each page is **4096 bytes** (aligned to SSD page size) and stores a **rootless sub-binary-tree of depth 6**: +All NOMT data shares the same PebbleDB instance as geth's other subsystems. +Each NOMT key type uses a distinct single-byte prefix: ``` -Page Layout (4096 bytes): -┌─────────────────────────────────────────┐ -│ 126 nodes × 32 bytes = 4032 bytes │ Nodes at depths 1-6 -│ Level 1: 2 nodes (siblings) │ (rootless — the root lives -│ Level 2: 4 nodes │ in the parent page) -│ Level 3: 8 nodes │ -│ Level 4: 16 nodes │ -│ Level 5: 32 nodes │ -│ Level 6: 64 nodes (leaf layer) │ -├─────────────────────────────────────────┤ -│ 24 bytes padding │ -├─────────────────────────────────────────┤ -│ 8 bytes: ElidedChildren bitfield (u64) │ Which child pages are elided -├─────────────────────────────────────────┤ -│ 32 bytes: PageID (encoded) │ Unique page identifier -└─────────────────────────────────────────┘ +PebbleDB Key Layout +==================== + +Prefix Key Format Value Description +------ ---------------------------------------- --------------- --------------------------- +0x01 0x01 || accountHash[32] RLP(SlimAcct) Account flat state +0x02 0x02 || accountHash[32] || slotHash[32] raw bytes 
Storage flat state +0x03 0x03 || stem[31] || suffix[1] value[32] Stem value slot +0x04 0x04 || PageID.Encode()[32] RawPage[4096] Trie page blob +0x05 0x05 || "root" Node[32] Page tree root hash ``` -Each page has up to **64 child pages**, one for each leaf position. The tree of pages has maximum depth 42 (since 6 × 42 = 252 ≈ 256 bits). +Key properties: -### 1.3 Page Identification - -A `PageID` is a path through the page tree — a sequence of 0-42 child indices (each 0..63). The encoding uses a base-64-like scheme: - -``` -Encoding: For path [c₀, c₁, ..., cₙ]: - result = 0 - for each cᵢ: - result += (cᵢ + 1) - result <<= 6 -``` - -This produces a unique, ordered 256-bit representation stored in the last 32 bytes of each page. The ordering property ensures depth-first traversal: parent < children < right siblings. - -**Key path mapping**: A 256-bit key maps to a chain of pages. Every 6 bits of the key selects a child index at the corresponding page depth. - -### 1.4 Page Elision - -If a subtree rooted at a page has fewer than **20 leaves** (the `PAGE_ELISION_THRESHOLD`), that page is not stored on disk. Instead, it is reconstructed on-the-fly from the parent page's nodes. The `ElidedChildren` bitfield (8 bytes, one bit per child slot) tracks which child pages are elided. - -This optimization significantly reduces storage and I/O for sparse regions of the trie. +- **0x01–0x02** (flat state): Used by `triedb/nomtdb/` for geth's standard + `StateReader` interface. Accounts are RLP-encoded `SlimAccount` structs. +- **0x03** (stem values): The 256 value slots per stem node. Each slot stores + a 32-byte value (account basic data, code hash, storage value, or code + chunk). Key = 33 bytes, value = 32 bytes. +- **0x04** (trie pages): The binary merkle page tree. Key = 33 bytes (prefix + + PageID encoding). Value = 4096 bytes (the full `RawPage`: 4032 bytes of + nodes plus the trailing padding, ElidedChildren bitfield, and PageID). 
+- **0x05** (metadata): Currently stores only the page tree root hash, + persisted atomically with page updates. Enables root recovery on restart. --- -## 2. Bitbox: The Page Store +## 3. Binary Merkle Trie Structure -Bitbox is an on-disk **open-addressing hash table** that maps PageIDs to 4096-byte pages. +### 3.1. Two-Layer Tree -### 2.1 Hash Table File Layout +The trie has two logical layers matching EIP-7864: ``` -HT File Layout: -┌──────────────────────────────────────────┐ -│ Meta Byte Pages │ ceil(num_buckets / 4096) pages -│ One byte per bucket (occupancy + tag) │ -├──────────────────────────────────────────┤ -│ Data Pages │ num_buckets pages -│ Each bucket = one 4096-byte page │ (the actual merkle tree pages) -└──────────────────────────────────────────┘ - -Total file size = (meta_pages + num_buckets) × 4096 + Root (canonical) + | + BuildInternalTree over 248-bit stem paths + | + +--------+-----------+-----------+--------+ + | | | | | + stem_0 stem_1 ... stem_k ... stem_n (depth 248) + | | | | + [256 slots each: SHA256 sub-tree of values] ``` -### 2.2 Meta Map +**Internal tree (depth 0–247)**: Binary SHA256 tree where each "leaf" is an +opaque 32-byte stem hash. Navigated by bits 0–247 of the stem path. This is +what `BuildInternalTree(skip=0)` computes — its root is the canonical state +root returned by `Hash()`. -One byte per bucket encodes state + hash tag for fast probing: - -| Byte Value | Meaning | -|------------|---------| -| `0x00` | Empty bucket | -| `0x7F` | Tombstone (deleted) | -| `0x80 \| (hash >> 57)` | Occupied, with 7-bit hash tag | - -The 7-bit hash tag enables filtering ~99% of non-matching buckets without reading the full page from disk. - -### 2.3 Probing - -Bitbox uses **triangular probing** (probe offsets: 0, 1, 3, 6, 10, ...): +**Stem nodes (depth 248)**: Each stem holds 256 value slots indexed by the +last byte (suffix). 
The stem hash is computed as: ``` -bucket₀ = hash(pageID) % capacity -bucketᵢ = (bucket₀ + i*(i+1)/2) % capacity +SHA256(stem_path[31] || 0x00 || subtree_root) + +where subtree_root = 8-level binary SHA256 tree over SHA256(value_i) for i in 0..255 ``` -The hash function is `xxhash3_64` with a random 16-byte seed (generated at DB creation). +### 3.2. Page Tree (Persistent Storage) -**Page lookup flow**: -1. Compute `hash(pageID)` → initial bucket -2. Check meta byte: empty → miss, tombstone → skip, tag mismatch → skip -3. On tag match: read the 4096-byte data page from disk -4. Verify: check if the PageID in the last 32 bytes matches -5. If mismatch (rare): continue probing - -### 2.4 WAL (Write-Ahead Log) - -Crash recovery uses a simple binary WAL: +For persistent storage, the 248-bit internal tree is partitioned into a tree +of 4KB pages. Each page stores a rootless sub-binary-tree of depth 6: ``` -WAL Format: -[START tag (1 byte)] [sync_seqn (4 bytes LE)] -repeated: - [CLEAR tag (1 byte)] [bucket (8 bytes LE)] - [UPDATE tag (1 byte)] [page_id (32 bytes)] [page_diff (16 bytes)] - [changed_nodes (N × 32 bytes)] [elided_children (8 bytes)] [bucket (8 bytes LE)] -[END tag (1 byte)] -[zero-padded to 4096-byte boundary] +Page Tree Organization +====================== + + Root Page (depth 0) <- 1 page, 126 nodes + / | \ + Child 0 ... Child k ... Child 63 <- up to 64 child pages + / \ / \ + ... ... ... ... <- up to 64^2 pages at depth 2 + (max depth 42: 6*42=252 bits) + +Each page: ++-----------------------------------------------+ +| 126 internal nodes (levels 1-6), 32 bytes each | 4032 bytes +| | +| Level 1: 2 nodes (left/right of root) | +| Level 2: 4 nodes | +| Level 3: 8 nodes | +| Level 4: 16 nodes | +| Level 5: 32 nodes | +| Level 6: 64 nodes (bottom layer) | +| | +| The root of this sub-tree lives in the | +| parent page's bottom layer (level 6). 
| ++-------------------------------------------------+ +| 24 bytes padding | ++-------------------------------------------------+ +| ElidedChildren: 8-byte bitfield (uint64 LE) | Which of the 64 +| bit i = 1 means child page i is elided | children are stored ++-------------------------------------------------+ inline (not on disk) +| PageID: 32-byte encoded identifier | ++-------------------------------------------------+ +Total: 4096 bytes (SSD page aligned) ``` -**Recovery protocol**: -1. On open, if WAL is non-empty, read sync sequence number -2. If it matches the expected sequence, replay all entries (apply diffs to HT pages, update meta map) -3. Write changed meta pages to HT file -4. Truncate and fsync WAL +**Page elision**: If a subtree has few leaves, its page is not stored on disk. +Instead, the sub-tree data lives inline in the parent page's bottom-layer +nodes. The `ElidedChildren` bitfield tracks which of the 64 child slots are +elided. This avoids storing nearly-empty pages for sparse trie regions. -### 2.5 Sync Protocol +### 3.3. PageID Encoding -Persisting dirty pages follows a strict three-phase protocol: +A `PageID` is a path through the page tree — a sequence of child indices +(each 0–63). The encoding produces a unique 32-byte key for PebbleDB: ``` -Phase 1: begin_sync() - ├── Lock meta map (write) - ├── For each dirty page: - │ ├── Allocate or reuse bucket via probing - │ ├── Update meta map (set_full or set_tombstone) - │ └── Record changes in WAL builder - ├── Build HT page write list - └── Update page cache (batch insert + evict) +PageID Encoding (shift-then-add) +================================= -Phase 2: wait_pre_meta() - └── Write WAL to disk + fsync (atomic durability point) +For path [c_0, c_1, ..., c_n]: + value = 0 + for each c_i: + value = (value << 6) + (c_i + 1) -Phase 3: [External] Write meta/manifest (atomic sync point) +Store as big-endian 32 bytes. 
-Phase 4: post_meta() - ├── Write dirty HT pages + meta pages to HT file + fsync - └── Truncate WAL (no fsync needed — see rationale below) +Examples: + Root page: path=[] -> 0x00...00 + Child 0: path=[0] -> 0x00...01 + Child 63: path=[63] -> 0x00...40 (64 decimal) + [5, 10]: path=[5,10] -> 0x00...018B ((6<<6)+11 = 395) + +Properties: + - Root encodes to all zeros + - Ordering: parent < children; shallower pages sort before deeper ones + - Unique: no two distinct paths produce the same encoding + - Max depth 42 (6*42 = 252 bits, fits in 256-bit key) ``` -**Why truncate without fsync**: If we crash before the next commit, the WAL replay is idempotent. If we reach the next commit, the new WAL write will fsync. - --- -## 3. Merkle Tree Updates: The Page Walker +## 4. Hash Functions -The `PageWalker` is the core algorithm for batch-updating the binary merkle trie. It processes sorted key-value updates left-to-right through the page tree. - -### 3.1 Sub-Trie Replacement - -Updates are grouped by which terminal node their keys map to. Each terminal is replaced with a new sub-trie built from the updates: - -- Delete a leaf → replace with terminator -- Insert where terminator was → replace with leaf -- Multiple inserts at same prefix → build an internal sub-trie -- Delete + insert at same key → replace leaf with new leaf - -### 3.2 Partial Compaction - -After each replacement, the walker hashes upward (computing internal node hashes) and compacts terminators. It stops at the point where the next update would also affect the result, avoiding redundant work. The last update hashes all the way to the root. - -### 3.3 Algorithm Sketch +All hashing uses **SHA256** (EIP-7864). There is no MSB tagging — nodes are +either all-zero (terminator) or opaque 32-byte hashes. ``` -PageWalker.advance_and_replace(terminal_position, operations): - 1. Build page stack down to terminal position - (loading existing pages or creating fresh ones) - 2. Build new sub-trie from operations (build_trie) - 3. 
Place new nodes into the current page - 4. Hash upward, compacting terminators: - while sibling(current) is terminator or leaf: - compact (merge leaf up or create terminator pair) - stop when next update will affect this path +Internal node: SHA256(left[32] || right[32]) + Both children are 32-byte nodes. -PageWalker.conclude(): - 1. Hash all remaining nodes up to root - 2. Emit root node + list of UpdatedPage entries +Stem node: SHA256(stem_path[31] || 0x00 || subtree_root[32]) + subtree_root = 8-level binary SHA256 tree over + SHA256(value_i) for i in 0..255 + (zero-hash pairs produce zero parent, pruning empty branches) + +Terminator: 0x00...00 (32 zero bytes) + Represents an empty sub-trie at any position. ``` -### 3.4 Parallel Updates (Workers) - -For large batches, the page tree is partitioned into regions (by root page children). Each region is processed by a separate worker goroutine: - -1. **Warm-up phase**: Prefetch pages that will be needed (walk keys, load pages from cache/disk) -2. **Update phase**: Run PageWalker on the region's subset of updates -3. **Merge phase**: Collect child page root nodes, update the root page +SHA256 hashers are pooled via `sync.Pool` to avoid allocation pressure during +batch hashing. --- -## 4. Flat Key-Value Storage (PebbleDB) +## 5. Update Pipeline -PebbleDB replaces NOMT's Beatree for storing raw key-value data. This is the "value store" that sits alongside Bitbox: - -### 4.1 Key Schema +### 5.1. Per-Block Flow ``` -Account data: key = 0x01 || keccak256(address) → RLP(SlimAccount) -Storage data: key = 0x02 || keccak256(address) || keccak256(slot) → value -Metadata: key = 0x00 || "root" → current root node (32 bytes) - key = 0x00 || "sync_seqn" → sync sequence number (4 bytes) - key = 0x00 || "seed" → hash table seed (16 bytes) +Block Execution +=============== + +1. StateDB accumulates changes + (UpdateAccount, UpdateStorage, UpdateContractCode) + | + v +2. 
NomtTrie.pending collects stemUpdates + Each update = (stem[31], suffix[1], value[32]) + | + v +3. NomtTrie.Hash() triggers flush: + | + +---> groupAndHashStems() + | | + | +-- Group by stem path (stable sort) + | +-- For each stem: + | | Load existing values from PebbleDB (prefix 0x03) + | | Merge new values, compute SHA256 stem hash + | | Write updated values back (batch) + | +-- Return sorted []StemKeyValue + | + +---> mergeStemKVs() + | Merge new stems into allStems (sorted in-place) + | Fast path: in-place update when no new stems added + | + +---> db.Update(stemKVs) + | Run PageWalker on page tree + | Persist updated pages to PebbleDB (prefix 0x04) + | Persist new page tree root (prefix 0x05) + | All writes in single atomic batch + | + +---> canonicalRoot() + BuildInternalTree(skip=0, allStems) + Returns 32-byte root matching bintrie exactly ``` -PebbleDB handles compaction, compression, and point-lookup optimization via bloom filters. +### 5.2. Page Update Engine (nomt/merkle/) + +The PageWalker processes sorted stem updates left-to-right through the page +tree. It loads pages from PebbleDB via the `PageSet` interface, modifies +nodes in place, and emits a list of `UpdatedPage` entries. + +``` +PageWalker Algorithm +==================== + +Input: sorted [(stem_path, stem_hash)] + current page tree root +Output: new root + list of UpdatedPage entries + +For each stem update: + 1. Descend through page stack to the target position + (load pages from PebbleDB or create fresh ones) + + 2. Place the stem hash at the target node position + + 3. Hash upward through the page tree: + - Compute SHA256(left || right) for modified internal nodes + - Compact terminator pairs (both children zero -> parent zero) + - Stop when remaining nodes will be affected by future updates + +After all updates: + 4. Hash remaining nodes up to the root + 5. 
Return new root + all modified pages + +Page Stack (in-memory during walk): + +--------+--------+--------+ + | Root | Child | Grand- | ...up to 42 pages deep + | Page | Page | child | + +--------+--------+--------+ + depth 0 depth 1 depth 2 +``` + +### 5.3. Parallel Workers + +For batches with 64+ updates, the page tree is partitioned by root page child +index (first 6 bits of each stem path = 64 possible buckets). Independent +subtrees are processed concurrently: + +``` +Parallel Update (depth-7 split) +================================ + + Root Page + / | \ + child 0 child k child 63 (64 slots) + | | | + +---+---+ +--+--+ +--+---+ + |Worker1| |W...k| |Worker| N goroutines + +---+---+ +--+--+ +--+---+ + | | | + [pages] [pages] [pages] each worker's UpdatedPages + \ | / + +-------+------+ + | + Merge child roots into + root page, persist all + pages in atomic batch +``` + +Each worker gets an independent `PageSet` (via `pageSetFactory`) to avoid +contention. After workers complete, their child-page roots are merged into +the root page. --- -## 5. Go Implementation Plan +## 6. PebbleDB Page Storage (nomt/db/) -### 5.1 Package Structure +The `pebblePageSet` implements `merkle.PageSet` backed by PebbleDB: + +``` +pebblePageSet +============= + + PageWalker + | + PageSet.Get(id) + | + +------------+------------+ + | | + cache[encoded_id]? diskdb.Get(0x04||id) + / \ | + hit: copy miss +-------+-------+ + & return | | + found: not found: + copy to cache, return fresh + return copy zeroed page + +IMPORTANT: Always return a COPY of cached pages. +The PageWalker mutates pages in place during updates. +A shared reference would corrupt the cache. 
+``` + +Page persistence uses PebbleDB's atomic batch writes: + +``` +Atomic Batch Write +================== + +batch := diskdb.NewBatch() + +for each UpdatedPage: + if page was cleared: + batch.Delete(0x04 || PageID.Encode()) + else: + batch.Put(0x04 || PageID.Encode(), page[0:4096]) + +batch.Put(0x05 || "root", new_root[0:32]) // persist root atomically +batch.Write() // single atomic operation + +No WAL needed — PebbleDB guarantees atomic batch writes. +If crash before Write(): no pages or root updated (safe). +If crash after Write(): all pages and root updated (consistent). +``` + +--- + +## 7. EIP-7864 Key Derivation + +Key derivation delegates to `trie/bintrie/` to guarantee identical key +generation. The 32-byte key is split into a 31-byte stem and 1-byte suffix: + +``` +EIP-7864 Key Layout +==================== + +|<--------- stem (31 bytes, 248 bits) --------->|<- suffix (1 byte) ->| + +Internal tree navigates bits 0-247 (stem path). +Stem node holds 256 value slots indexed by suffix (0-255). + +Account Keys: + key = SHA256(SHA256(address) || base_offset) + stem = key[0:31] + BasicData: suffix = 0 (nonce at [8:16], balance at [16:32]) + CodeHash: suffix = 1 (32-byte code hash) + +Storage Keys: + key = SHA256(SHA256(address) || storage_offset) + stem = key[0:31], suffix = key[31] + storage_offset encodes slot position within 256-slot groups + +Code Chunk Keys: + chunks = ChunkifyCode(bytecode) (31-byte chunks, right-padded) + For chunk number N: + groupOffset = (N + 128) % 256 + if groupOffset == 0 or N == 0: + offset[24:32] = uint64_le(N + 128) + key = SHA256(SHA256(address) || offset) + stem = key[0:31] + suffix = groupOffset +``` + +--- + +## 8. Canonical Root vs. Page Tree Root + +The system computes two related but distinct roots: + +``` +Root Computation +================ + +1. 
Canonical Root (returned by Hash()): + BuildInternalTree(skip=0, allStems) + - Pure computation over sorted (stem, hash) pairs + - 248-bit binary tree, no page structure + - Identical to bintrie's root for the same state + - This is the state root in block headers + +2. Page Tree Root (persisted in PebbleDB): + merkle.ParallelUpdate(root, stemKVs, workers, pageSetFactory) + - Partitioned into 4KB pages at 6-bit boundaries + - Workers split at depth 7, adding SHA256(hash||zeros) wrapping + - Root may differ from canonical due to wrapping levels + - Used for persistent page storage and incremental updates + +The page tree root is an implementation detail. Only the canonical +root (from BuildInternalTree) is externally visible. +``` + +--- + +## 9. Package Structure ``` go-ethereum/ nomt/ - core/ - node.go # Node type, KeyPath, ValueHash, NodeKind, TERMINATOR - hasher.go # NodeHasher interface, keccak256-based binary hasher - page.go # Page constants, RawPage read/write helpers - pageid.go # PageID encode/decode, child/parent, iterator - triepos.go # TriePosition: depth tracking, page navigation - pagediff.go # PageDiff: 126-bit change tracking bitfield - update.go # build_trie helper, WriteNode, leaf splicing - bitbox/ - db.go # DB handle: open, sync entry point, utilization - htfile.go # HTOffsets, file creation, layout math - metamap.go # MetaMap: per-bucket metadata byte - probe.go # ProbeSequence: triangular probing, xxhash - pagecache.go # Sharded LRU page cache with fixed-level pinning - pageloader.go # PageLoader: probe-based page retrieval - wal.go # WAL writer and reader (blob format) - writeout.go # write_wal, write_ht, truncate_wal - sync.go # SyncController: begin_sync, wait_pre_meta, post_meta - recover.go # WAL recovery logic - merkle/ - pagewalker.go # Left-to-right batch trie update engine - pageset.go # PageSet interface for walker's page access - elided.go # ElidedChildren 64-bit bitfield - worker.go # Parallel update workers (warm-up + sharded) - 
io/ - directio.go # O_DIRECT helpers (linux/darwin build tags) - pagepool.go # sync.Pool-backed 4096-byte aligned page allocator + core/ Pure data structures, no I/O + node.go Node type, Terminator, NodeKind + hasher.go SHA256 hashing (pooled), HashInternal, HashStem + page.go RawPage [4096]byte, level-order node access + pageid.go PageID encode/decode, child/parent navigation + pagediff.go 126-bit change tracking bitfield + triepos.go TriePosition: depth tracking, page boundary detection + update.go StemKeyValue, BuildInternalTree, StemSharedBits + + merkle/ Page-based update engine, storage-agnostic + pageset.go PageSet interface, MemoryPageSet + pagewalker.go Left-to-right batch trie updates + worker.go Parallel workers (partitioned at depth 7) + elided.go ElidedChildren 64-bit bitfield + + db/ PebbleDB integration layer + db.go DB struct, pebblePageSet, atomic batch writes + Key prefixes: 0x04 (pages), 0x05 (metadata) + + trie/ + nomttrie/ state.Trie implementation + trie.go NomtTrie: UpdateAccount/Storage/Code, Hash, Commit + key_encoding.go EIP-7864 key derivation (delegates to bintrie) + stem.go Stem value storage, groupAndHashStems + triedb/ - nomtdb/ - config.go # NomtDB configuration - database.go # Database implementing backend interface - reader.go # NodeReader and StateReader implementations -``` - -### 5.2 Key Data Structures - -```go -// --- core/node.go --- -type Node = [32]byte -type KeyPath = [32]byte -type ValueHash = [32]byte -var TERMINATOR Node // all zeros - -type NodeKind int -const ( - NodeKindTerminator NodeKind = iota - NodeKindLeaf - NodeKindInternal -) - -// --- core/page.go --- -const ( - Depth = 6 - NodesPerPage = 126 // (2^7) - 2 - PageSize = 4096 - NumChildren = 64 // 2^Depth -) - -// --- core/pageid.go --- -type PageID struct { - path []uint8 // each 0..63, len 0..42 -} - -// --- bitbox/db.go --- -type DB struct { - pagePool *PagePool - store HTOffsets - seed [16]byte - metaMap *sync.RWMutex // guards MetaMap - walFD *os.File - 
htFD *os.File - capacity int - occupied atomic.Int64 -} - -// --- bitbox/probe.go --- -type ProbeSequence struct { - hash uint64 - bucket uint64 - step uint64 -} -``` - -### 5.3 Hash Function - -For Ethereum compatibility, use **keccak256** with MSB labeling: - -```go -func HashInternal(left, right Node) Node { - h := crypto.Keccak256(left[:], right[:]) - var node Node - copy(node[:], h) - node[0] &= 0x7F // clear MSB → internal - return node -} - -func HashLeaf(keyPath KeyPath, valueHash ValueHash) Node { - h := crypto.Keccak256(keyPath[:], valueHash[:]) - var node Node - copy(node[:], h) - node[0] |= 0x80 // set MSB → leaf - return node -} -``` - -> **Trade-off note**: NOMT defaults to Blake3 (~3-5x faster). The `NodeHasher` interface makes this swappable. For production, a protocol change could adopt Blake3. - -### 5.4 I/O Adaptation - -| NOMT (Rust) | Go Port | -|-------------|---------| -| `io_uring` (Linux) | `os.File.ReadAt` / `WriteAt` + goroutine pool | -| `pread`/`pwrite` (fallback) | Same via `os.File` | -| `fcntl(F_NOCACHE)` (macOS) | Same via `syscall.Syscall` | -| `O_DIRECT` (Linux) | `syscall.O_DIRECT` + aligned buffers | -| mmap WAL builder | `[]byte` buffer (WAL is small) | -| `threadpool` + channels | goroutines + `chan` | - -### 5.5 Geth Integration - -The NOMT backend plugs into geth's `triedb` framework as a new backend option alongside HashDB and PathDB: - -```go -// triedb/database.go — add to Config -type Config struct { - HashDB *hashdb.Config - PathDB *pathdb.Config - NomtDB *nomtdb.Config // NEW -} - -// nomtdb/database.go — implements backend interface -type Database struct { - bitbox *bitbox.DB - pebble *pebble.DB - cache *bitbox.PageCache - root core.Node - syncSeq uint32 - lock sync.RWMutex -} - -func (db *Database) NodeReader(root common.Hash) (database.NodeReader, error) -func (db *Database) StateReader(root common.Hash) (database.StateReader, error) -func (db *Database) Commit(root common.Hash, report bool) error -``` - 
-**StateReader** serves accounts and storage directly from PebbleDB (flat reads, no trie traversal for data). **NodeReader** resolves pages from Bitbox and returns individual node values. - -### 5.6 Update Flow (Per Block) - -``` -1. StateDB collects changed accounts/storage slots - -2. Build update set: - For each changed key: - key_path = keccak256(key) - value_hash = keccak256(value) // or TERMINATOR if deleted - Sort by key_path - -3. Run PageWalker (parallel workers for large batches): - Input: sorted [(key_path, value_hash)] + current page tree - Output: new root node + list of UpdatedPages - -4. Persist: - a. Write flat KV changes to PebbleDB (batch write) - b. Sync dirty pages via SyncController: - begin_sync → WAL write → meta update → HT write - -5. Return new root hash (the 32-byte root node) + nomtdb/ triedb backend + config.go Config (NumWorkers) + database.go Database: NodeReader, StateReader, DiskDB, NomtDB + reader.go Flat state readers, key prefix constants (0x01, 0x02) ``` --- -## 6. Implementation Phases +## 10. Data Flow Diagram -### Phase 1: Core Primitives -**Goal**: All trie data structures, no I/O. +Complete flow for a single block's state update: -Files: `nomt/core/*.go` + tests - -- Node types, hashing, MSB labeling -- Page layout (read/write nodes, elided children, page ID from page) -- PageID encode/decode/iterate -- TriePosition traversal -- PageDiff bitfield operations - -**Milestone**: All unit tests pass. Hash outputs match NOMT's Rust tests. - -### Phase 2: Page Walker -**Goal**: In-memory batch update engine. - -Files: `nomt/merkle/*.go` + tests - -- ElidedChildren bitfield -- PageSet interface + in-memory implementation -- PageWalker: advance_and_replace, conclude -- build_trie helper - -**Milestone**: Walker produces correct root hashes and page diffs for known inputs. - -### Phase 3: Bitbox Storage -**Goal**: On-disk hash table. 
- -Files: `nomt/bitbox/*.go` (htfile, metamap, probe, pagecache, pageloader, db) + `nomt/io/*.go` - -- Page pool (sync.Pool, aligned allocation) -- Direct I/O helpers (linux/darwin) -- MetaMap operations -- HT file creation and opening -- Probe sequence + page loading -- Page cache (sharded LRU with fixed-level pinning) - -**Milestone**: Can create HT file, insert pages, read them back. Cache serves hot pages. - -### Phase 4: WAL and Sync -**Goal**: Crash-safe persistence. - -Files: `nomt/bitbox/wal.go`, `sync.go`, `writeout.go`, `recover.go` + tests - -- WAL blob writer/reader -- SyncController three-phase protocol -- Recovery from WAL -- writeout helpers (write_wal, write_ht, truncate_wal) - -**Milestone**: Full sync cycle works. Simulated crash recovery restores correct state. - -### Phase 5: Geth Backend -**Goal**: Wire into geth's triedb framework. - -Files: `triedb/nomtdb/*.go` + modifications to `triedb/database.go` - -- Config, Database, NodeReader, StateReader -- PebbleDB flat KV integration -- Update flow: StateSet → PageWalker → SyncController -- Add NomtDB case to triedb.NewDatabase and triedb.Update - -**Milestone**: geth configured with NOMT backend can read/write state via standard interfaces. - -### Phase 6: Parallel Workers + Optimization -**Goal**: Throughput optimization. - -- Parallel page walkers (sharded by root page children) -- Warm-up phase (prefetch pages before updates) -- Benchmarks vs. 
existing backends +``` ++-----------+ UpdateAccount(addr, acc) +------------+ +| StateDB | -----------------------------> | NomtTrie | +| | UpdateStorage(addr, k, v) | | +| | -----------------------------> | pending: | +| | UpdateContractCode(addr, c) | [{stem, | +| | -----------------------------> | suffix, | ++-----------+ | value}] | + +-----+------+ + | + Hash() called + | + +-----v------+ + | groupAnd | + | HashStems | + +-----+------+ + | + +-------------------------+-------------------------+ + | | + +-----v------+ +-----v------+ + | Load stem | | Compute | + | values | | stem hash | + | from 0x03 | | SHA256 | + | prefix | | sub-tree | + +-----+------+ +-----+------+ + | | + +-----v------+ | + | Merge new | | + | values | | + +-----+------+ | + | | + +-----v------+ | + | Write back | | + | to 0x03 | | + | (batch) | | + +------------+ | + | + +---------------------------------------------------+ + | + +-----v----------+ + | []StemKeyValue | sorted by stem path + +-----+----------+ + | + +-----------+-----------+ + | | + +-----v------+ +-----v------+ + | Merge into | | db.Update | + | allStems | +-----+------+ + | (sorted) | | + +-----+------+ +-----v-----------+ + | | ParallelUpdate | + | | (PageWalker x N)| + | +-----+-----------+ + | | + | +-----v------+ + | | PebbleDB | + | | batch: | + | | put pages | + | | put root | + | +------------+ + | + +-----v-----------------+ + | BuildInternalTree | + | skip=0, allStems | + +-----+-----------------+ + | + +-----v------+ + | Canonical | <-- returned by Hash() + | state root | matches bintrie exactly + +------------+ +``` --- -## 7. Key Source Files (Reference) +## 11. 
Crash Safety -| Component | NOMT Source | Purpose | -|-----------|-------------|---------| -| Node types | `nomt/core/src/trie.rs` | Node, KeyPath, TERMINATOR, NodeKind | -| Hasher | `nomt/core/src/hasher.rs` | MSB labeling, hash functions | -| Page layout | `nomt/core/src/page.rs` | DEPTH=6, NODES_PER_PAGE=126 | -| Page IDs | `nomt/core/src/page_id.rs` | Encode/decode, child/parent | -| Trie position | `nomt/core/src/trie_pos.rs` | Depth tracking, node indexing | -| Page diff | `nomt/core/src/page_diff.rs` | 126-bit change bitfield | -| Page walker | `nomt/nomt/src/merkle/page_walker.rs` | Batch update algorithm | -| Page set | `nomt/nomt/src/merkle/page_set.rs` | Page access interface | -| Workers | `nomt/nomt/src/merkle/worker.rs` | Parallel update workers | -| Bitbox DB | `nomt/nomt/src/bitbox/mod.rs` | Hash table DB, sync, probing | -| HT file | `nomt/nomt/src/bitbox/ht_file.rs` | File layout, create/open | -| Meta map | `nomt/nomt/src/bitbox/meta_map.rs` | Per-bucket metadata bytes | -| WAL | `nomt/nomt/src/bitbox/wal.rs` | Write-ahead log format | -| Page cache | `nomt/nomt/src/page_cache.rs` | Sharded LRU cache | +All persistent state changes are made through PebbleDB's atomic batch writes: -| Component | Geth Source | Purpose | -|-----------|-------------|---------| -| Backend interface | `go-ethereum/triedb/database/database.go` | NodeReader, StateReader | -| Backend selector | `go-ethereum/triedb/database.go` | Config, NewDatabase | -| PathDB (reference) | `go-ethereum/triedb/pathdb/database.go` | Similar backend pattern | -| State DB | `go-ethereum/core/state/statedb.go` | State management layer | +``` +Crash Safety Guarantees +======================== + +State changes happen in two atomic batches per Hash() call: + +Batch 1 (stem values): + Write updated stem values to prefix 0x03 + -> If crash after this batch: stem values updated, but page tree + and canonical root unchanged.
Next Hash() will recompute + stem hashes from flat state, producing correct result. + +Batch 2 (page tree): + Write updated pages to prefix 0x04 + Write new page tree root to prefix 0x05 + -> Atomic: either all pages + root update, or none do. + -> If crash before: pages unchanged, root unchanged. + Next block re-applies the same page updates. + -> If crash after: consistent state, root matches pages. + +Recovery on startup: + 1. Read root from PebbleDB (prefix 0x05) + 2. If found and valid (32 bytes): use as current root + 3. If not found: fresh database, root = Terminator + No WAL replay, no file scanning, no repair needed. +``` --- -## 8. Verification Plan +## 12. Performance Characteristics -1. **Unit tests**: Port NOMT's Rust test cases for PageID, PageDiff, MetaMap, ProbeSequence -2. **Hash compatibility**: Verify node hashes match expected values for known inputs -3. **PageWalker correctness**: Feed same inputs as Rust tests, compare root hashes -4. **WAL recovery**: Simulate crashes at each sync phase, verify recovery -5. **Integration**: Process historical Ethereum blocks, verify state root matches -6. **Benchmarks**: Compare commit latency and throughput vs. HashDB/PathDB backends +``` +Operation Costs +================ + +Read account: 1 PebbleDB point lookup (prefix 0x03, stem slot 0) + + 1 PebbleDB point lookup (stem slot 1 for code hash) + Bloom filter makes misses fast. + +Read storage: 1 PebbleDB point lookup (prefix 0x03) + +Write account: 2 pending stemUpdates (basic data + code hash) + Deferred to Hash() — no I/O during execution. + +Hash() flush: O(S) prefix iterations for S dirty stems (load values) + O(S * log(S)) sort + O(S * 256) SHA256 hashing (stem sub-trees) + O(P) page reads/writes for P affected pages + 2 PebbleDB atomic batch writes + +Parallelism: Page tree updates partitioned across N workers + (default: runtime.NumCPU()) + Workers share no state — each has own PageSet. + Effective for 64+ stems per block. 
+ +Memory: 4KB per cached page (pebblePageSet, per-worker) + ~64 bytes per tracked stem (allStems slice) + SHA256 hasher pool (sync.Pool, reused across calls) +``` + +--- + +## 13. Cross-Validation + +The implementation is validated against geth's `trie/bintrie/` which +independently implements EIP-7864. Both produce identical state roots: + +``` +Cross-Validation Test Matrix +============================== + +Test Accounts Contracts Slots Distributions +--------------------------- -------- --------- ------ ------------- +TestRootEquality/Small 100 50 1-20 PowerLaw +TestRootEquality/Medium 1,000 500 1-100 PowerLaw +TestRootEquality/Large 10,000 5,000 1-500 PowerLaw +TestDistributionVariants 100 50 1-20 PowerLaw,Uniform,Exp +TestIncrementalRootEquality 20 10 1-5 Uniform (per-op) +TestDeterminism 100 50 1-20 PowerLaw (2x same seed) + +All tests verify: bintrie_root == nomt_root at every block boundary. +Race detector enabled on all test runs. +``` diff --git a/nomt/bitbox/bitbox_test.go b/nomt/bitbox/bitbox_test.go deleted file mode 100644 index da2af00833..0000000000 --- a/nomt/bitbox/bitbox_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package bitbox - -import ( - "os" - "path/filepath" - "testing" - - "github.com/ethereum/go-ethereum/nomt/core" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// --- HT File Layout Tests --- - -func TestHTOffsetsMetaByteOffset(t *testing.T) { - offsets := NewHTOffsets(8192) - assert.Equal(t, int64(pageSize), offsets.MetaByteOffset(0)) - assert.Equal(t, int64(pageSize+1), offsets.MetaByteOffset(1)) -} - -func TestHTOffsetsDataPageOffset(t *testing.T) { - // capacity=4096 → 1 meta page - offsets := NewHTOffsets(4096) - assert.Equal(t, uint64(1), offsets.MetaPages) - - // Data starts at: header(4096) + 1 meta page(4096) = 8192 - assert.Equal(t, int64(8192), offsets.DataPageOffset(0)) - assert.Equal(t, int64(8192+4096), offsets.DataPageOffset(1)) -} - -func TestHTOffsetsTotalFileSize(t *testing.T) { - 
offsets := NewHTOffsets(4096) - // header(4096) + 1 meta page(4096) + 4096 data pages * 4096 - expected := int64(4096 + 4096 + 4096*4096) - assert.Equal(t, expected, offsets.TotalFileSize()) -} - -func TestHTOffsetsMetaPagesRoundup(t *testing.T) { - offsets := NewHTOffsets(5000) - assert.Equal(t, uint64(2), offsets.MetaPages) -} - -func TestCreateOpenHTFile(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.ht") - - seed := HashSeedFromUint64(42, 99) - f, offsets, err := CreateHTFile(path, 1024, seed) - require.NoError(t, err) - assert.Equal(t, uint64(1024), offsets.Capacity) - f.Close() - - f2, offsets2, seed2, occ, err := OpenHTFile(path) - require.NoError(t, err) - defer f2.Close() - - assert.Equal(t, seed, seed2) - assert.Equal(t, uint64(1024), offsets2.Capacity) - assert.Equal(t, uint64(0), occ) -} - -// --- Meta Byte Tests --- - -func TestMetaByteEncoding(t *testing.T) { - assert.True(t, IsEmpty(MetaEmpty)) - assert.False(t, IsOccupied(MetaEmpty)) - assert.False(t, IsTombstone(MetaEmpty)) - - assert.True(t, IsTombstone(MetaTombstone)) - assert.False(t, IsEmpty(MetaTombstone)) - assert.False(t, IsOccupied(MetaTombstone)) - - occupied := MakeOccupied(0xFFFFFFFFFFFFFFFF) - assert.True(t, IsOccupied(occupied)) - assert.False(t, IsEmpty(occupied)) - assert.False(t, IsTombstone(occupied)) -} - -func TestMetaByteTagMatching(t *testing.T) { - hash := uint64(0xABCDEF1234567890) - meta := MakeOccupied(hash) - assert.True(t, TagMatches(meta, hash)) - - // Different high bits should not match. 
- differentHash := uint64(0x1234EF1234567890) - assert.False(t, TagMatches(meta, differentHash)) -} - -func TestMetaMapSetGet(t *testing.T) { - mm := NewMetaMap(8192) - assert.Equal(t, MetaEmpty, mm.Get(0)) - - mm.Set(100, MakeOccupied(12345)) - assert.True(t, IsOccupied(mm.Get(100))) -} - -func TestMetaMapDirtyTracking(t *testing.T) { - mm := NewMetaMap(8192) // 2 meta pages - assert.Empty(t, mm.DirtyMetaPages()) - - mm.Set(0, MetaTombstone) // page 0 - mm.Set(5000, MetaTombstone) // page 1 - - dirty := mm.DirtyMetaPages() - assert.Len(t, dirty, 2) - assert.Contains(t, dirty, uint64(0)) - assert.Contains(t, dirty, uint64(1)) - - mm.ClearDirty() - assert.Empty(t, mm.DirtyMetaPages()) -} - -// --- Probe Sequence Tests --- - -func TestProbeSequenceInitial(t *testing.T) { - p := NewProbeSequence(42, 1024) - assert.Equal(t, uint64(42%1024), p.Bucket()) - assert.Equal(t, uint64(42), p.Hash()) -} - -func TestProbeSequenceTriangular(t *testing.T) { - p := NewProbeSequence(0, 16) // initial bucket = 0 - assert.Equal(t, uint64(0), p.Bucket()) - - p.Next() // step=1 → (0+1)%16 = 1 - assert.Equal(t, uint64(1), p.Bucket()) - - p.Next() // step=2 → (1+2)%16 = 3 - assert.Equal(t, uint64(3), p.Bucket()) - - p.Next() // step=3 → (3+3)%16 = 6 - assert.Equal(t, uint64(6), p.Bucket()) - - p.Next() // step=4 → (6+4)%16 = 10 - assert.Equal(t, uint64(10), p.Bucket()) -} - -func TestProbeSequenceVisitsAll(t *testing.T) { - // With power-of-2 capacity, triangular probing should visit all buckets. 
- capacity := uint64(16) - p := NewProbeSequence(0, capacity) - - visited := make(map[uint64]bool, capacity) - for range capacity { - visited[p.Bucket()] = true - p.Next() - } - - assert.Equal(t, int(capacity), len(visited), - "triangular probing should visit all buckets") -} - -func TestHashPageID(t *testing.T) { - seed := HashSeedFromUint64(1, 2) - root := core.RootPageID() - h1 := HashPageID(seed, root) - h2 := HashPageID(seed, root) - assert.Equal(t, h1, h2, "same inputs should produce same hash") - - // Different seed should produce different hash. - seed2 := HashSeedFromUint64(3, 4) - h3 := HashPageID(seed2, root) - assert.NotEqual(t, h1, h3) -} - -// --- DB Integration Tests --- - -func TestDBCreateAndOpen(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - assert.Equal(t, uint64(1024), db.Capacity()) - assert.Equal(t, int64(0), db.Occupied()) - require.NoError(t, db.Sync()) - require.NoError(t, db.Close()) - - db2, err := Open(path) - require.NoError(t, err) - defer db2.Close() - assert.Equal(t, seed, db2.Seed()) - assert.Equal(t, uint64(1024), db2.Capacity()) -} - -func TestDBStoreAndLoad(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - // Store a page. - rootID := core.RootPageID() - page := new(core.RawPage) - page.SetNodeAt(0, core.Node{0x42}) - - bucket, err := db.StorePage(rootID, page) - require.NoError(t, err) - assert.Equal(t, int64(1), db.Occupied()) - - // Load it back. 
- loaded, loadBucket, found, err := db.LoadPage(rootID) - require.NoError(t, err) - assert.True(t, found) - assert.Equal(t, bucket, loadBucket) - assert.Equal(t, core.Node{0x42}, loaded.NodeAt(0)) -} - -func TestDBStoreOverwrite(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - page1 := new(core.RawPage) - page1.SetNodeAt(0, core.Node{0x01}) - _, err = db.StorePage(rootID, page1) - require.NoError(t, err) - - // Overwrite with new data. - page2 := new(core.RawPage) - page2.SetNodeAt(0, core.Node{0x02}) - _, err = db.StorePage(rootID, page2) - require.NoError(t, err) - - // Should still only have 1 occupied. - assert.Equal(t, int64(1), db.Occupied()) - - loaded, _, found, err := db.LoadPage(rootID) - require.NoError(t, err) - assert.True(t, found) - assert.Equal(t, core.Node{0x02}, loaded.NodeAt(0)) -} - -func TestDBDelete(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - page := new(core.RawPage) - _, err = db.StorePage(rootID, page) - require.NoError(t, err) - - deleted, err := db.DeletePage(rootID) - require.NoError(t, err) - assert.True(t, deleted) - assert.Equal(t, int64(0), db.Occupied()) - - _, _, found, err := db.LoadPage(rootID) - require.NoError(t, err) - assert.False(t, found) -} - -func TestDBDeleteNonexistent(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - deleted, err := db.DeletePage(rootID) - require.NoError(t, err) - assert.False(t, deleted) -} - -func TestDBLoadMiss(t *testing.T) { - dir := t.TempDir() 
- path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - _, _, found, err := db.LoadPage(rootID) - require.NoError(t, err) - assert.False(t, found) -} - -func TestDBMultiplePages(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - childID, err := rootID.ChildPageID(0) - require.NoError(t, err) - childID2, err := rootID.ChildPageID(1) - require.NoError(t, err) - - // Store 3 pages. - for i, pid := range []core.PageID{rootID, childID, childID2} { - page := new(core.RawPage) - page.SetNodeAt(0, core.Node{byte(i + 1)}) - _, err := db.StorePage(pid, page) - require.NoError(t, err) - } - - assert.Equal(t, int64(3), db.Occupied()) - - // Load each one. - for i, pid := range []core.PageID{rootID, childID, childID2} { - loaded, _, found, err := db.LoadPage(pid) - require.NoError(t, err) - assert.True(t, found, "page %d", i) - assert.Equal(t, core.Node{byte(i + 1)}, loaded.NodeAt(0)) - } -} - -func TestDBPersistAndReopen(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - - rootID := core.RootPageID() - page := new(core.RawPage) - page.SetNodeAt(0, core.Node{0xAB}) - _, err = db.StorePage(rootID, page) - require.NoError(t, err) - - require.NoError(t, db.Sync()) - require.NoError(t, db.Close()) - - // Reopen and verify. 
- db2, err := Open(path) - require.NoError(t, err) - defer db2.Close() - - loaded, _, found, err := db2.LoadPage(rootID) - require.NoError(t, err) - assert.True(t, found) - assert.Equal(t, core.Node{0xAB}, loaded.NodeAt(0)) -} - -func TestDBCapacityMustBePowerOf2(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - seed := HashSeedFromUint64(1, 2) - - _, err := Create(path, 1000, seed) - assert.Error(t, err) - // Cleanup any partial file. - os.Remove(path) -} - -func TestDBDeleteAndReinsert(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.bitbox") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(path, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - - // Insert → delete → insert should work. - page1 := new(core.RawPage) - page1.SetNodeAt(0, core.Node{0x01}) - _, err = db.StorePage(rootID, page1) - require.NoError(t, err) - - _, err = db.DeletePage(rootID) - require.NoError(t, err) - - page2 := new(core.RawPage) - page2.SetNodeAt(0, core.Node{0x02}) - _, err = db.StorePage(rootID, page2) - require.NoError(t, err) - - loaded, _, found, err := db.LoadPage(rootID) - require.NoError(t, err) - assert.True(t, found) - assert.Equal(t, core.Node{0x02}, loaded.NodeAt(0)) -} diff --git a/nomt/bitbox/db.go b/nomt/bitbox/db.go deleted file mode 100644 index bf88a1c8b8..0000000000 --- a/nomt/bitbox/db.go +++ /dev/null @@ -1,284 +0,0 @@ -package bitbox - -import ( - "encoding/binary" - "fmt" - "os" - "sync/atomic" - - "github.com/ethereum/go-ethereum/nomt/core" -) - -// DB is the Bitbox on-disk hash table for storing trie pages. -type DB struct { - file *os.File - offsets HTOffsets - metaMap *MetaMap - seed [16]byte - capacity uint64 - occupied atomic.Int64 -} - -// Create creates a new Bitbox database at the given path. -// Capacity must be a power of 2. 
-func Create(path string, capacity uint64, seed [16]byte) (*DB, error) { - if capacity == 0 || capacity&(capacity-1) != 0 { - return nil, fmt.Errorf("bitbox: capacity must be a power of 2") - } - - f, offsets, err := CreateHTFile(path, capacity, seed) - if err != nil { - return nil, err - } - - mm := NewMetaMap(capacity) - - db := &DB{ - file: f, - offsets: offsets, - metaMap: mm, - seed: seed, - capacity: capacity, - } - return db, nil -} - -// Open opens an existing Bitbox database. -func Open(path string) (*DB, error) { - f, offsets, seed, occupied, err := OpenHTFile(path) - if err != nil { - return nil, err - } - - mm, err := LoadMetaMap(f, offsets) - if err != nil { - f.Close() - return nil, err - } - - db := &DB{ - file: f, - offsets: offsets, - metaMap: mm, - seed: seed, - capacity: offsets.Capacity, - } - db.occupied.Store(int64(occupied)) - return db, nil -} - -// Close closes the database file. -func (db *DB) Close() error { - return db.file.Close() -} - -// Seed returns the hash seed. -func (db *DB) Seed() [16]byte { - return db.seed -} - -// Capacity returns the total number of buckets. -func (db *DB) Capacity() uint64 { - return db.capacity -} - -// Occupied returns the number of occupied buckets. -func (db *DB) Occupied() int64 { - return db.occupied.Load() -} - -// LoadPage reads a page from the hash table by probing for its PageID. -// Returns the page, the bucket index where it was found, and whether it exists. -func (db *DB) LoadPage(pageID core.PageID) ( - *core.RawPage, uint64, bool, error, -) { - hash := HashPageID(db.seed, pageID) - probe := NewProbeSequence(hash, db.capacity) - encodedID := pageID.Encode() - - for range db.capacity { - bucket := probe.Bucket() - meta := db.metaMap.Get(bucket) - - if IsEmpty(meta) { - // Definitely not in the table. 
- return nil, 0, false, nil - } - - if IsTombstone(meta) { - probe.Next() - continue - } - - if !TagMatches(meta, hash) { - probe.Next() - continue - } - - // Tag matches — read the data page to confirm. - page, err := db.readDataPage(bucket) - if err != nil { - return nil, 0, false, err - } - - storedID := page.PageIDBytes() - if storedID == encodedID { - return page, bucket, true, nil - } - - probe.Next() - } - - return nil, 0, false, nil -} - -// StorePage writes a page to the hash table. If the page already exists -// (by probing), it is overwritten in-place. Otherwise, a new bucket is -// allocated. -func (db *DB) StorePage(pageID core.PageID, page *core.RawPage) ( - uint64, error, -) { - // Ensure the encoded PageID is in the page data. - encodedID := pageID.Encode() - page.SetPageIDBytes(encodedID) - - hash := HashPageID(db.seed, pageID) - probe := NewProbeSequence(hash, db.capacity) - metaByte := MakeOccupied(hash) - - var firstTombstone int64 = -1 - - for range db.capacity { - bucket := probe.Bucket() - meta := db.metaMap.Get(bucket) - - if IsEmpty(meta) { - // Use tombstone if we passed one, otherwise use this empty slot. - target := bucket - if firstTombstone >= 0 { - target = uint64(firstTombstone) - } else { - db.occupied.Add(1) - } - db.metaMap.Set(target, metaByte) - if err := db.writeDataPage(target, page); err != nil { - return 0, err - } - return target, nil - } - - if IsTombstone(meta) { - if firstTombstone < 0 { - firstTombstone = int64(bucket) - } - probe.Next() - continue - } - - if TagMatches(meta, hash) { - // Check if this is the same page. - existing, err := db.readDataPage(bucket) - if err != nil { - return 0, err - } - if existing.PageIDBytes() == encodedID { - // Overwrite in-place. 
- if err := db.writeDataPage(bucket, page); err != nil { - return 0, err - } - return bucket, nil - } - } - - probe.Next() - } - - return 0, fmt.Errorf("bitbox: hash table full") -} - -// DeletePage removes a page from the hash table by setting its meta byte -// to tombstone. -func (db *DB) DeletePage(pageID core.PageID) (bool, error) { - hash := HashPageID(db.seed, pageID) - probe := NewProbeSequence(hash, db.capacity) - encodedID := pageID.Encode() - - for range db.capacity { - bucket := probe.Bucket() - meta := db.metaMap.Get(bucket) - - if IsEmpty(meta) { - return false, nil - } - - if IsTombstone(meta) { - probe.Next() - continue - } - - if TagMatches(meta, hash) { - existing, err := db.readDataPage(bucket) - if err != nil { - return false, err - } - if existing.PageIDBytes() == encodedID { - db.metaMap.Set(bucket, MetaTombstone) - db.occupied.Add(-1) - return true, nil - } - } - - probe.Next() - } - - return false, nil -} - -// FlushMeta writes all dirty meta pages to disk and updates the header. -func (db *DB) FlushMeta() error { - for _, pageIdx := range db.metaMap.DirtyMetaPages() { - if err := db.metaMap.WriteMetaPage(db.file, pageIdx); err != nil { - return err - } - } - db.metaMap.ClearDirty() - - // Update occupied count in header. - var buf [8]byte - occ := max(db.occupied.Load(), 0) - binary.LittleEndian.PutUint64(buf[:], uint64(occ)) - if _, err := db.file.WriteAt(buf[:], occupiedOffset); err != nil { - return fmt.Errorf("bitbox: update occupied count: %w", err) - } - - return nil -} - -// Sync flushes all pending data to disk. 
-func (db *DB) Sync() error { - if err := db.FlushMeta(); err != nil { - return err - } - return db.file.Sync() -} - -// --- internal I/O --- - -func (db *DB) readDataPage(bucket uint64) (*core.RawPage, error) { - page := new(core.RawPage) - offset := db.offsets.DataPageOffset(bucket) - if _, err := db.file.ReadAt(page[:], offset); err != nil { - return nil, fmt.Errorf("bitbox: read data page at bucket %d: %w", - bucket, err) - } - return page, nil -} - -func (db *DB) writeDataPage(bucket uint64, page *core.RawPage) error { - offset := db.offsets.DataPageOffset(bucket) - if _, err := db.file.WriteAt(page[:], offset); err != nil { - return fmt.Errorf("bitbox: write data page at bucket %d: %w", - bucket, err) - } - return nil -} diff --git a/nomt/bitbox/htfile.go b/nomt/bitbox/htfile.go deleted file mode 100644 index 1dcbbbef0e..0000000000 --- a/nomt/bitbox/htfile.go +++ /dev/null @@ -1,127 +0,0 @@ -// Package bitbox implements an on-disk open-addressing hash table that maps -// PageIDs to 4096-byte pages. It is the storage backend for the NOMT trie. -package bitbox - -import ( - "encoding/binary" - "fmt" - "os" - - "github.com/ethereum/go-ethereum/nomt/core" -) - -const ( - // pageSize is the size of a disk page. - pageSize = core.PageSize // 4096 - - // metaBytesPerPage is the number of meta bytes that fit in one page. - metaBytesPerPage = pageSize - - // headerSize is the size of the HT file header in bytes. - // Layout: [seed 16] [capacity 8] [occupied 8] = 32 bytes, padded to - // one full page. - headerSize = pageSize - - // seedOffset is the offset of the 16-byte seed in the header. - seedOffset = 0 - // capacityOffset is the offset of the 8-byte capacity in the header. - capacityOffset = 16 - // occupiedOffset is the offset of the 8-byte occupied count. - occupiedOffset = 24 -) - -// HTOffsets holds precomputed file offsets for the hash table file layout. 
-// -// File layout: -// -// [header: 1 page] [meta pages: ceil(capacity/4096)] [data pages: capacity * 4096] -type HTOffsets struct { - // Capacity is the number of buckets in the hash table. - Capacity uint64 - // MetaPages is ceil(Capacity / 4096). - MetaPages uint64 -} - -// NewHTOffsets creates an HTOffsets for the given capacity. -func NewHTOffsets(capacity uint64) HTOffsets { - return HTOffsets{ - Capacity: capacity, - MetaPages: (capacity + metaBytesPerPage - 1) / metaBytesPerPage, - } -} - -// MetaByteOffset returns the file offset for the meta byte of a given bucket. -func (o *HTOffsets) MetaByteOffset(bucket uint64) int64 { - return int64(headerSize) + int64(bucket) -} - -// DataPageOffset returns the file offset for the data page of a given bucket. -func (o *HTOffsets) DataPageOffset(bucket uint64) int64 { - dataStart := int64(headerSize) + int64(o.MetaPages)*pageSize - return dataStart + int64(bucket)*pageSize -} - -// TotalFileSize returns the total size of the HT file in bytes. -func (o *HTOffsets) TotalFileSize() int64 { - return int64(headerSize) + int64(o.MetaPages)*pageSize + - int64(o.Capacity)*pageSize -} - -// CreateHTFile creates a new hash table file with the given capacity and seed. -// The file is pre-allocated to its full size. -func CreateHTFile(path string, capacity uint64, seed [16]byte) ( - *os.File, HTOffsets, error, -) { - offsets := NewHTOffsets(capacity) - - f, err := os.Create(path) - if err != nil { - return nil, offsets, fmt.Errorf("bitbox: create HT file: %w", err) - } - - // Pre-allocate. - totalSize := offsets.TotalFileSize() - if err := f.Truncate(totalSize); err != nil { - f.Close() - return nil, offsets, fmt.Errorf("bitbox: truncate HT file: %w", err) - } - - // Write header. 
- var header [headerSize]byte - copy(header[seedOffset:], seed[:]) - binary.LittleEndian.PutUint64(header[capacityOffset:], capacity) - binary.LittleEndian.PutUint64(header[occupiedOffset:], 0) - - if _, err := f.WriteAt(header[:], 0); err != nil { - f.Close() - return nil, offsets, fmt.Errorf("bitbox: write header: %w", err) - } - - return f, offsets, nil -} - -// OpenHTFile opens an existing hash table file and reads its header. -func OpenHTFile(path string) ( - *os.File, HTOffsets, [16]byte, uint64, error, -) { - f, err := os.OpenFile(path, os.O_RDWR, 0) - if err != nil { - return nil, HTOffsets{}, [16]byte{}, 0, - fmt.Errorf("bitbox: open HT file: %w", err) - } - - var header [headerSize]byte - if _, err := f.ReadAt(header[:], 0); err != nil { - f.Close() - return nil, HTOffsets{}, [16]byte{}, 0, - fmt.Errorf("bitbox: read header: %w", err) - } - - var seed [16]byte - copy(seed[:], header[seedOffset:seedOffset+16]) - capacity := binary.LittleEndian.Uint64(header[capacityOffset:]) - occupied := binary.LittleEndian.Uint64(header[occupiedOffset:]) - offsets := NewHTOffsets(capacity) - - return f, offsets, seed, occupied, nil -} diff --git a/nomt/bitbox/metamap.go b/nomt/bitbox/metamap.go deleted file mode 100644 index 9261bcb37c..0000000000 --- a/nomt/bitbox/metamap.go +++ /dev/null @@ -1,118 +0,0 @@ -package bitbox - -import ( - "fmt" - "os" -) - -// Meta byte constants. -const ( - // MetaEmpty marks an empty bucket. - MetaEmpty byte = 0x00 - // MetaTombstone marks a deleted bucket (still probed through). - MetaTombstone byte = 0x7F -) - -// IsOccupied reports whether a meta byte indicates an occupied bucket. -// Occupied bytes have bit 7 set (value >= 0x80). -func IsOccupied(b byte) bool { - return b&0x80 != 0 -} - -// IsEmpty reports whether a meta byte indicates an empty bucket. -func IsEmpty(b byte) bool { - return b == MetaEmpty -} - -// IsTombstone reports whether a meta byte indicates a tombstone. 
-func IsTombstone(b byte) bool { - return b == MetaTombstone -} - -// MakeOccupied creates an occupied meta byte from a hash value. -// It takes the top 7 bits of the hash and sets bit 7 to 1. -func MakeOccupied(hash uint64) byte { - return 0x80 | byte(hash>>57) -} - -// TagMatches reports whether an occupied meta byte could match a given hash. -func TagMatches(metaByte byte, hash uint64) bool { - return IsOccupied(metaByte) && metaByte == MakeOccupied(hash) -} - -// MetaMap holds an in-memory copy of all meta bytes for the hash table. -type MetaMap struct { - data []byte - dirty []bool // per meta-page dirty tracking -} - -// NewMetaMap creates a MetaMap for the given capacity with all empty buckets. -func NewMetaMap(capacity uint64) *MetaMap { - metaPages := (capacity + metaBytesPerPage - 1) / metaBytesPerPage - return &MetaMap{ - data: make([]byte, capacity), - dirty: make([]bool, metaPages), - } -} - -// LoadMetaMap reads all meta bytes from the HT file into memory. -func LoadMetaMap(f *os.File, offsets HTOffsets) (*MetaMap, error) { - mm := NewMetaMap(offsets.Capacity) - - // Read all meta bytes at once. - metaRegionSize := int64(offsets.MetaPages) * pageSize - buf := make([]byte, metaRegionSize) - if _, err := f.ReadAt(buf, int64(headerSize)); err != nil { - return nil, fmt.Errorf("bitbox: load meta map: %w", err) - } - - // Copy only the capacity-many bytes (the rest is padding). - copy(mm.data, buf[:offsets.Capacity]) - return mm, nil -} - -// Get returns the meta byte for a bucket. -func (m *MetaMap) Get(bucket uint64) byte { - return m.data[bucket] -} - -// Set writes a meta byte for a bucket and marks the containing page dirty. -func (m *MetaMap) Set(bucket uint64, value byte) { - m.data[bucket] = value - m.dirty[bucket/metaBytesPerPage] = true -} - -// DirtyMetaPages returns the indices of meta pages that have been modified -// since the last call to ClearDirty. 
-func (m *MetaMap) DirtyMetaPages() []uint64 { - pages := make([]uint64, 0, len(m.dirty)) - for i, d := range m.dirty { - if d { - pages = append(pages, uint64(i)) - } - } - return pages -} - -// ClearDirty resets all dirty flags. -func (m *MetaMap) ClearDirty() { - for i := range m.dirty { - m.dirty[i] = false - } -} - -// WriteMetaPage writes a single meta page (identified by index) to the file. -func (m *MetaMap) WriteMetaPage( - f *os.File, pageIdx uint64, -) error { - var buf [pageSize]byte - start := pageIdx * metaBytesPerPage - end := min(start+metaBytesPerPage, uint64(len(m.data))) - copy(buf[:], m.data[start:end]) - - offset := int64(headerSize) + int64(pageIdx)*pageSize - if _, err := f.WriteAt(buf[:], offset); err != nil { - return fmt.Errorf("bitbox: write meta page %d: %w", pageIdx, err) - } - return nil -} diff --git a/nomt/bitbox/probe.go b/nomt/bitbox/probe.go deleted file mode 100644 index 6ba6c7685b..0000000000 --- a/nomt/bitbox/probe.go +++ /dev/null @@ -1,81 +0,0 @@ -package bitbox - -import ( - "encoding/binary" - - "github.com/cespare/xxhash/v2" - "github.com/ethereum/go-ethereum/nomt/core" -) - -// HashPageID computes the xxhash64 of seed||encodedPageID. -func HashPageID(seed [16]byte, pageID core.PageID) uint64 { - encoded := pageID.Encode() - var buf [48]byte - copy(buf[:16], seed[:]) - copy(buf[16:], encoded[:]) - return xxhash.Sum64(buf[:]) -} - -// HashPageIDBytes computes the xxhash64 from seed and raw encoded page ID. -func HashPageIDBytes(seed [16]byte, encodedPageID [32]byte) uint64 { - var buf [48]byte - copy(buf[:16], seed[:]) - copy(buf[16:], encodedPageID[:]) - return xxhash.Sum64(buf[:]) -} - -// HashSeedFromBytes creates a [16]byte seed from a byte slice. -func HashSeedFromBytes(b []byte) [16]byte { - var seed [16]byte - copy(seed[:], b) - return seed -} - -// HashSeedFromUint64 creates a deterministic seed from two uint64 values. 
-func HashSeedFromUint64(a, b uint64) [16]byte { - var seed [16]byte - binary.LittleEndian.PutUint64(seed[:8], a) - binary.LittleEndian.PutUint64(seed[8:], b) - return seed -} - -// ProbeSequence implements triangular probing over the hash table. -// -// Bucket(step) = (initial + step*(step+1)/2) mod capacity -// -// With a power-of-2 capacity, triangular probing visits every bucket before -// repeating, guaranteeing termination. -type ProbeSequence struct { - hash uint64 - bucket uint64 - step uint64 - capacity uint64 -} - -// NewProbeSequence creates a new probe sequence for the given hash and -// capacity. The capacity MUST be a power of 2. -func NewProbeSequence(hash, capacity uint64) ProbeSequence { - initial := hash % capacity - return ProbeSequence{ - hash: hash, - bucket: initial, - step: 0, - capacity: capacity, - } -} - -// Bucket returns the current bucket index. -func (p *ProbeSequence) Bucket() uint64 { - return p.bucket -} - -// Hash returns the hash used to seed this probe. -func (p *ProbeSequence) Hash() uint64 { - return p.hash -} - -// Next advances to the next bucket in the triangular probe sequence. -func (p *ProbeSequence) Next() { - p.step++ - p.bucket = (p.bucket + p.step) % p.capacity -} diff --git a/nomt/bitbox/recover.go b/nomt/bitbox/recover.go deleted file mode 100644 index fb31ca91e6..0000000000 --- a/nomt/bitbox/recover.go +++ /dev/null @@ -1,73 +0,0 @@ -package bitbox - -import ( - "fmt" - - "github.com/ethereum/go-ethereum/nomt/core" -) - -// Recover replays the WAL file to restore the database to a consistent state. -// Returns the sync sequence number from the WAL, or 0 if no recovery was -// needed. -func (db *DB) Recover(walPath string) (uint32, error) { - data, err := ReadWALFile(walPath) - if err != nil { - return 0, fmt.Errorf("bitbox/recover: %w", err) - } - - if len(data) == 0 { - return 0, nil // No recovery needed. 
- } - - syncSeqn, entries, err := ReadWAL(data) - if err != nil { - return 0, fmt.Errorf("bitbox/recover: parse: %w", err) - } - - for _, entry := range entries { - switch entry.Kind { - case WALEntryClear: - db.metaMap.Set(entry.ClearBucket, MetaTombstone) - - case WALEntryUpdate: - // Read the existing data page at this bucket (or use a fresh one). - page, readErr := db.readDataPage(entry.UpdateBucket) - if readErr != nil { - page = new(core.RawPage) - } - - // Apply the diff: unpack changed nodes into the page. - entry.Diff.UnpackChangedNodes(entry.ChangedNodes, page) - - // Set elided children and page ID. - page.SetElidedChildren(entry.ElidedChildren) - page.SetPageIDBytes(entry.PageID) - - // Write data page. - if err := db.writeDataPage(entry.UpdateBucket, page); err != nil { - return 0, fmt.Errorf("bitbox/recover: write page: %w", err) - } - - // Update meta byte. - hash := HashPageIDBytes(db.seed, entry.PageID) - db.metaMap.Set(entry.UpdateBucket, MakeOccupied(hash)) - } - } - - // Write dirty meta pages. - if err := db.FlushMeta(); err != nil { - return 0, fmt.Errorf("bitbox/recover: flush meta: %w", err) - } - - // fsync HT file. - if err := db.file.Sync(); err != nil { - return 0, fmt.Errorf("bitbox/recover: fsync: %w", err) - } - - // Truncate WAL. - if err := TruncateWALFile(walPath); err != nil { - return 0, fmt.Errorf("bitbox/recover: truncate WAL: %w", err) - } - - return syncSeqn, nil -} diff --git a/nomt/bitbox/sync.go b/nomt/bitbox/sync.go deleted file mode 100644 index dcd76acad8..0000000000 --- a/nomt/bitbox/sync.go +++ /dev/null @@ -1,131 +0,0 @@ -package bitbox - -import ( - "fmt" - - "github.com/ethereum/go-ethereum/nomt/core" - "github.com/ethereum/go-ethereum/nomt/merkle" -) - -// SyncPlan holds the pre-computed work for a sync operation. 
-type SyncPlan struct { - walData []byte - dataWrites []dataWrite - syncSeqn uint32 -} - -type dataWrite struct { - bucket uint64 - page *core.RawPage -} - -// BeginSync prepares a sync plan from a set of updated pages. It allocates -// or reuses buckets, builds the WAL, and returns a SyncPlan. -// -// This is Phase 1 of the 3-phase sync protocol. -func (db *DB) BeginSync( - walPath string, - syncSeqn uint32, - updates []merkle.UpdatedPage, -) (*SyncPlan, error) { - wal := NewWALBuilder() - writes := make([]dataWrite, 0, len(updates)) - - for _, up := range updates { - if up.Diff.IsCleared() { - // Page was cleared — tombstone its bucket. - _, bucket, found, err := db.LoadPage(up.PageID) - if err != nil { - return nil, fmt.Errorf("bitbox/sync: load for clear: %w", err) - } - if found { - db.metaMap.Set(bucket, MetaTombstone) - db.occupied.Add(-1) - wal.AddClear(bucket) - } - continue - } - - // Encode the PageID into the page data. - encodedID := up.PageID.Encode() - up.Page.SetPageIDBytes(encodedID) - up.Page.SetElidedChildren(up.Page.ElidedChildren()) - - // Allocate or reuse a bucket. - bucket, err := db.StorePage(up.PageID, up.Page) - if err != nil { - return nil, fmt.Errorf("bitbox/sync: store page: %w", err) - } - - // Pack changed nodes from diff. - changedNodes := up.Diff.PackChangedNodes(up.Page) - - wal.AddUpdate( - encodedID, - up.Diff, - changedNodes, - up.Page.ElidedChildren(), - bucket, - ) - writes = append(writes, dataWrite{bucket: bucket, page: up.Page}) - } - - walData := wal.Finish(syncSeqn) - - return &SyncPlan{ - walData: walData, - dataWrites: writes, - syncSeqn: syncSeqn, - }, nil -} - -// WriteWAL writes the WAL to disk and fsyncs it. -// -// This is Phase 2 of the 3-phase sync protocol. -func (db *DB) WriteWAL(walPath string, plan *SyncPlan) error { - return WriteWALFile(walPath, plan.walData) -} - -// CommitSync writes dirty HT data + meta pages, fsyncs the HT file, and -// truncates the WAL. 
-// -// This is Phase 3 of the 3-phase sync protocol. -func (db *DB) CommitSync(walPath string, plan *SyncPlan) error { - // Write data pages. - for _, dw := range plan.dataWrites { - if err := db.writeDataPage(dw.bucket, dw.page); err != nil { - return fmt.Errorf("bitbox/sync: write data: %w", err) - } - } - - // Write dirty meta pages. - if err := db.FlushMeta(); err != nil { - return fmt.Errorf("bitbox/sync: flush meta: %w", err) - } - - // fsync the HT file. - if err := db.file.Sync(); err != nil { - return fmt.Errorf("bitbox/sync: fsync HT: %w", err) - } - - // Truncate WAL — no fsync needed. - return TruncateWALFile(walPath) -} - -// FullSync runs all three phases of the sync protocol. -func (db *DB) FullSync( - walPath string, - syncSeqn uint32, - updates []merkle.UpdatedPage, -) error { - plan, err := db.BeginSync(walPath, syncSeqn, updates) - if err != nil { - return err - } - - if err := db.WriteWAL(walPath, plan); err != nil { - return err - } - - return db.CommitSync(walPath, plan) -} diff --git a/nomt/bitbox/wal.go b/nomt/bitbox/wal.go deleted file mode 100644 index 8b5322c1a1..0000000000 --- a/nomt/bitbox/wal.go +++ /dev/null @@ -1,296 +0,0 @@ -package bitbox - -import ( - "encoding/binary" - "fmt" - "os" - - "github.com/ethereum/go-ethereum/nomt/core" -) - -// WAL entry type tags. -const ( - walTagStart byte = 0x01 - walTagClear byte = 0x02 - walTagUpdate byte = 0x03 - walTagEnd byte = 0x04 -) - -// WALEntryKind distinguishes the types of WAL entries. -type WALEntryKind int - -const ( - WALEntryClear WALEntryKind = iota - WALEntryUpdate -) - -// WALEntry represents a single entry in the WAL. -type WALEntry struct { - Kind WALEntryKind - - // For Clear entries: - ClearBucket uint64 - - // For Update entries: - PageID [32]byte - Diff core.PageDiff - ChangedNodes []core.Node - ElidedChildren uint64 - UpdateBucket uint64 -} - -// WALBuilder accumulates WAL entries in memory before serializing. 
-type WALBuilder struct { - entries []WALEntry -} - -// NewWALBuilder creates an empty WAL builder. -func NewWALBuilder() *WALBuilder { - return &WALBuilder{ - entries: make([]WALEntry, 0, 64), - } -} - -// AddClear adds a CLEAR entry (tombstone a bucket). -func (b *WALBuilder) AddClear(bucket uint64) { - b.entries = append(b.entries, WALEntry{ - Kind: WALEntryClear, - ClearBucket: bucket, - }) -} - -// AddUpdate adds an UPDATE entry. -func (b *WALBuilder) AddUpdate( - pageID [32]byte, - diff core.PageDiff, - changedNodes []core.Node, - elidedChildren uint64, - bucket uint64, -) { - b.entries = append(b.entries, WALEntry{ - Kind: WALEntryUpdate, - PageID: pageID, - Diff: diff, - ChangedNodes: changedNodes, - ElidedChildren: elidedChildren, - UpdateBucket: bucket, - }) -} - -// Finish serializes the WAL with a START and END record, padded to a -// multiple of pageSize. -func (b *WALBuilder) Finish(syncSeqn uint32) []byte { - // Estimate size: START(5) + entries + END(1). - estimatedSize := 5 + 1 - for _, e := range b.entries { - switch e.Kind { - case WALEntryClear: - estimatedSize += 1 + 8 // tag + bucket - case WALEntryUpdate: - estimatedSize += 1 + 32 + 16 + len(e.ChangedNodes)*32 + 8 + 8 - } - } - - buf := make([]byte, 0, estimatedSize+pageSize) - - // START: tag(1) + syncSeqn(4) - buf = append(buf, walTagStart) - var seqBuf [4]byte - binary.LittleEndian.PutUint32(seqBuf[:], syncSeqn) - buf = append(buf, seqBuf[:]...) - - // Entries. - var u64Buf [8]byte - for _, e := range b.entries { - switch e.Kind { - case WALEntryClear: - buf = append(buf, walTagClear) - binary.LittleEndian.PutUint64(u64Buf[:], e.ClearBucket) - buf = append(buf, u64Buf[:]...) - - case WALEntryUpdate: - buf = append(buf, walTagUpdate) - buf = append(buf, e.PageID[:]...) - encoded := e.Diff.Encode() - buf = append(buf, encoded[:]...) - for _, n := range e.ChangedNodes { - buf = append(buf, n[:]...) 
- } - binary.LittleEndian.PutUint64(u64Buf[:], e.ElidedChildren) - buf = append(buf, u64Buf[:]...) - binary.LittleEndian.PutUint64(u64Buf[:], e.UpdateBucket) - buf = append(buf, u64Buf[:]...) - } - } - - // END tag. - buf = append(buf, walTagEnd) - - // Pad to page boundary. - if rem := len(buf) % pageSize; rem != 0 { - padding := make([]byte, pageSize-rem) - buf = append(buf, padding...) - } - - return buf -} - -// ReadWAL parses a WAL from raw bytes. Returns the sync sequence number -// and the list of entries. Returns an error if the WAL is malformed. -func ReadWAL(data []byte) (uint32, []WALEntry, error) { - if len(data) == 0 { - return 0, nil, nil // Empty WAL = no recovery needed. - } - - pos := 0 - read := func(n int) ([]byte, error) { - if pos+n > len(data) { - return nil, fmt.Errorf("bitbox/wal: unexpected EOF at offset %d", pos) - } - b := data[pos : pos+n] - pos += n - return b, nil - } - - readByte := func() (byte, error) { - b, err := read(1) - if err != nil { - return 0, err - } - return b[0], nil - } - - readU32 := func() (uint32, error) { - b, err := read(4) - if err != nil { - return 0, err - } - return binary.LittleEndian.Uint32(b), nil - } - - readU64 := func() (uint64, error) { - b, err := read(8) - if err != nil { - return 0, err - } - return binary.LittleEndian.Uint64(b), nil - } - - // Read START. 
- tag, err := readByte() - if err != nil { - return 0, nil, err - } - if tag != walTagStart { - return 0, nil, fmt.Errorf("bitbox/wal: expected START tag, got 0x%02x", tag) - } - - syncSeqn, err := readU32() - if err != nil { - return 0, nil, err - } - - var entries []WALEntry - - for { - tag, err := readByte() - if err != nil { - return 0, nil, err - } - - switch tag { - case walTagEnd: - return syncSeqn, entries, nil - - case walTagClear: - bucket, err := readU64() - if err != nil { - return 0, nil, err - } - entries = append(entries, WALEntry{ - Kind: WALEntryClear, - ClearBucket: bucket, - }) - - case walTagUpdate: - pidBytes, err := read(32) - if err != nil { - return 0, nil, err - } - var pageID [32]byte - copy(pageID[:], pidBytes) - - diffBytes, err := read(16) - if err != nil { - return 0, nil, err - } - var diffBuf [16]byte - copy(diffBuf[:], diffBytes) - diff := core.DecodePageDiff(diffBuf) - - nodeCount := diff.Count() - nodes := make([]core.Node, nodeCount) - for i := range nodeCount { - nodeBytes, err := read(32) - if err != nil { - return 0, nil, err - } - copy(nodes[i][:], nodeBytes) - } - - elidedChildren, err := readU64() - if err != nil { - return 0, nil, err - } - bucket, err := readU64() - if err != nil { - return 0, nil, err - } - - entries = append(entries, WALEntry{ - Kind: WALEntryUpdate, - PageID: pageID, - Diff: diff, - ChangedNodes: nodes, - ElidedChildren: elidedChildren, - UpdateBucket: bucket, - }) - - default: - return 0, nil, fmt.Errorf("bitbox/wal: unknown tag 0x%02x at offset %d", - tag, pos-1) - } - } -} - -// WriteWALFile writes a WAL to a file, creating or truncating it. -func WriteWALFile(path string, data []byte) error { - if err := os.WriteFile(path, data, 0644); err != nil { - return fmt.Errorf("bitbox/wal: write: %w", err) - } - // fsync via re-open. 
- f, err := os.Open(path) - if err != nil { - return fmt.Errorf("bitbox/wal: open for sync: %w", err) - } - defer f.Close() - return f.Sync() -} - -// ReadWALFile reads a WAL file. Returns nil data if the file doesn't exist -// or is empty. -func ReadWALFile(path string) ([]byte, error) { - data, err := os.ReadFile(path) - if err != nil { - if os.IsNotExist(err) { - return nil, nil - } - return nil, fmt.Errorf("bitbox/wal: read: %w", err) - } - return data, nil -} - -// TruncateWALFile empties the WAL file. -func TruncateWALFile(path string) error { - return os.Truncate(path, 0) -} diff --git a/nomt/bitbox/wal_test.go b/nomt/bitbox/wal_test.go deleted file mode 100644 index da01d7f820..0000000000 --- a/nomt/bitbox/wal_test.go +++ /dev/null @@ -1,217 +0,0 @@ -package bitbox - -import ( - "path/filepath" - "testing" - - "github.com/ethereum/go-ethereum/nomt/core" - "github.com/ethereum/go-ethereum/nomt/merkle" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// --- WAL Builder/Reader Tests --- - -func TestWALEmptyRoundTrip(t *testing.T) { - b := NewWALBuilder() - data := b.Finish(42) - - // Should be padded to page boundary. 
- assert.Equal(t, 0, len(data)%pageSize) - - seqn, entries, err := ReadWAL(data) - require.NoError(t, err) - assert.Equal(t, uint32(42), seqn) - assert.Empty(t, entries) -} - -func TestWALClearEntryRoundTrip(t *testing.T) { - b := NewWALBuilder() - b.AddClear(123) - b.AddClear(456) - data := b.Finish(1) - - seqn, entries, err := ReadWAL(data) - require.NoError(t, err) - assert.Equal(t, uint32(1), seqn) - require.Len(t, entries, 2) - - assert.Equal(t, WALEntryClear, entries[0].Kind) - assert.Equal(t, uint64(123), entries[0].ClearBucket) - assert.Equal(t, uint64(456), entries[1].ClearBucket) -} - -func TestWALUpdateEntryRoundTrip(t *testing.T) { - var pageID [32]byte - pageID[0] = 0xAB - - var diff core.PageDiff - diff.SetChanged(5) - diff.SetChanged(70) - - nodes := []core.Node{{0x01}, {0x02}} - - b := NewWALBuilder() - b.AddUpdate(pageID, diff, nodes, 0xFF, 99) - data := b.Finish(7) - - seqn, entries, err := ReadWAL(data) - require.NoError(t, err) - assert.Equal(t, uint32(7), seqn) - require.Len(t, entries, 1) - - e := entries[0] - assert.Equal(t, WALEntryUpdate, e.Kind) - assert.Equal(t, pageID, e.PageID) - assert.True(t, e.Diff.IsChanged(5)) - assert.True(t, e.Diff.IsChanged(70)) - require.Len(t, e.ChangedNodes, 2) - assert.Equal(t, core.Node{0x01}, e.ChangedNodes[0]) - assert.Equal(t, core.Node{0x02}, e.ChangedNodes[1]) - assert.Equal(t, uint64(0xFF), e.ElidedChildren) - assert.Equal(t, uint64(99), e.UpdateBucket) -} - -func TestWALMixedEntries(t *testing.T) { - b := NewWALBuilder() - b.AddClear(10) - - var pid [32]byte - var diff core.PageDiff - diff.SetChanged(0) - b.AddUpdate(pid, diff, []core.Node{{0xAA}}, 0, 20) - - b.AddClear(30) - data := b.Finish(100) - - _, entries, err := ReadWAL(data) - require.NoError(t, err) - require.Len(t, entries, 3) - assert.Equal(t, WALEntryClear, entries[0].Kind) - assert.Equal(t, WALEntryUpdate, entries[1].Kind) - assert.Equal(t, WALEntryClear, entries[2].Kind) -} - -func TestReadWALEmpty(t *testing.T) { - seqn, entries, err 
:= ReadWAL(nil) - require.NoError(t, err) - assert.Equal(t, uint32(0), seqn) - assert.Nil(t, entries) -} - -func TestWALFilePersistence(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "test.wal") - - b := NewWALBuilder() - b.AddClear(42) - data := b.Finish(5) - - require.NoError(t, WriteWALFile(path, data)) - - loaded, err := ReadWALFile(path) - require.NoError(t, err) - assert.Equal(t, data, loaded) - - require.NoError(t, TruncateWALFile(path)) - loaded2, err := ReadWALFile(path) - require.NoError(t, err) - assert.Empty(t, loaded2) -} - -// --- Sync Controller Tests --- - -func TestFullSyncCycle(t *testing.T) { - dir := t.TempDir() - htPath := filepath.Join(dir, "test.bitbox") - walPath := filepath.Join(dir, "test.wal") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(htPath, 1024, seed) - require.NoError(t, err) - defer db.Close() - - rootID := core.RootPageID() - page := new(core.RawPage) - page.SetNodeAt(0, core.Node{0xAA}) - - var diff core.PageDiff - diff.SetChanged(0) - - updates := []merkle.UpdatedPage{{ - PageID: rootID, - Page: page, - Diff: diff, - }} - - require.NoError(t, db.FullSync(walPath, 1, updates)) - - // Verify page is persisted. - loaded, _, found, err := db.LoadPage(rootID) - require.NoError(t, err) - assert.True(t, found) - assert.Equal(t, core.Node{0xAA}, loaded.NodeAt(0)) -} - -// --- Recovery Tests --- - -func TestRecoverFromWAL(t *testing.T) { - dir := t.TempDir() - htPath := filepath.Join(dir, "test.bitbox") - walPath := filepath.Join(dir, "test.wal") - - seed := HashSeedFromUint64(1, 2) - - // Create DB and write a WAL but don't commit Phase 3. - db, err := Create(htPath, 1024, seed) - require.NoError(t, err) - - rootID := core.RootPageID() - page := new(core.RawPage) - page.SetNodeAt(0, core.Node{0xBB}) - - var diff core.PageDiff - diff.SetChanged(0) - - updates := []merkle.UpdatedPage{{ - PageID: rootID, - Page: page, - Diff: diff, - }} - - // Phase 1 + 2 only (simulate crash before Phase 3). 
- plan, err := db.BeginSync(walPath, 5, updates) - require.NoError(t, err) - require.NoError(t, db.WriteWAL(walPath, plan)) - db.Close() - - // Reopen and recover. - db2, err := Open(htPath) - require.NoError(t, err) - defer db2.Close() - - seqn, err := db2.Recover(walPath) - require.NoError(t, err) - assert.Equal(t, uint32(5), seqn) - - // Verify the page was recovered. - loaded, _, found, err := db2.LoadPage(rootID) - require.NoError(t, err) - assert.True(t, found) - assert.Equal(t, core.Node{0xBB}, loaded.NodeAt(0)) -} - -func TestRecoverNoWAL(t *testing.T) { - dir := t.TempDir() - htPath := filepath.Join(dir, "test.bitbox") - walPath := filepath.Join(dir, "test.wal") - - seed := HashSeedFromUint64(1, 2) - db, err := Create(htPath, 1024, seed) - require.NoError(t, err) - defer db.Close() - - seqn, err := db.Recover(walPath) - require.NoError(t, err) - assert.Equal(t, uint32(0), seqn, "no recovery needed") -} diff --git a/nomt/db/db.go b/nomt/db/db.go index 4a8079f23e..0ea2925b0e 100644 --- a/nomt/db/db.go +++ b/nomt/db/db.go @@ -1,35 +1,37 @@ -// Package db provides the unified NOMT trie database combining Bitbox +// Package db provides the NOMT trie database combining PebbleDB page // storage with the PageWalker merkle engine. // -// This package handles only the trie structure (merkle pages). Flat -// key-value storage (accounts, storage slots) stays on geth's PebbleDB. +// Trie pages are stored as 4KB blobs in geth's ethdb under key prefix 0x04. +// Flat key-value storage (accounts, storage slots) stays on geth's PebbleDB +// under separate prefixes managed by triedb/nomtdb. 
package db import ( "bytes" - "crypto/rand" "fmt" - "os" - "path/filepath" "runtime" "sort" "sync" - "github.com/ethereum/go-ethereum/nomt/bitbox" + "github.com/ethereum/go-ethereum/ethdb" "github.com/ethereum/go-ethereum/nomt/core" "github.com/ethereum/go-ethereum/nomt/merkle" ) const ( - htFileName = "nomt.ht" - walFileName = "nomt.wal" + // nomtPagePrefix is the ethdb key prefix for NOMT trie pages. + // Key format: 0x04 || PageID.Encode()[32] → RawPage[4032] + nomtPagePrefix byte = 0x04 + + // nomtMetaPrefix is the ethdb key prefix for NOMT metadata. + nomtMetaPrefix byte = 0x05 ) +// nomtMetaRootKey is the ethdb key for the persisted page tree root. +var nomtMetaRootKey = []byte{nomtMetaPrefix, 'r', 'o', 'o', 't'} + // Config holds configuration for the NOMT database. type Config struct { - // HTCapacity is the number of hash table buckets. Must be a power of 2. - HTCapacity uint64 - // NumWorkers is the number of parallel goroutines for trie updates. // Defaults to runtime.NumCPU() if zero. NumWorkers int @@ -37,71 +39,34 @@ type Config struct { // DefaultConfig returns a default configuration. func DefaultConfig() Config { - return Config{ - HTCapacity: 1 << 20, // ~1M buckets = ~4GB - } + return Config{} } // DB is the NOMT trie database. type DB struct { - dataDir string - bb *bitbox.DB + diskdb ethdb.Database root core.Node - syncSeqn uint32 numWorkers int mu sync.RWMutex } -// Open opens or creates a NOMT trie database at the given directory. -func Open(dataDir string, config Config) (*DB, error) { - if err := os.MkdirAll(dataDir, 0755); err != nil { - return nil, fmt.Errorf("nomt/db: create datadir: %w", err) - } - - htPath := filepath.Join(dataDir, htFileName) - walPath := filepath.Join(dataDir, walFileName) - - var bb *bitbox.DB - var err error - - if _, statErr := os.Stat(htPath); os.IsNotExist(statErr) { - // Create new database. 
- var seed [16]byte - if _, err := rand.Read(seed[:]); err != nil { - return nil, fmt.Errorf("nomt/db: generate seed: %w", err) - } - bb, err = bitbox.Create(htPath, config.HTCapacity, seed) - if err != nil { - return nil, fmt.Errorf("nomt/db: create bitbox: %w", err) - } - } else { - // Open existing database. - bb, err = bitbox.Open(htPath) - if err != nil { - return nil, fmt.Errorf("nomt/db: open bitbox: %w", err) - } - } - +// New creates or opens a NOMT trie database backed by the given ethdb. +// The page tree root is loaded from persisted metadata if available. +func New(diskdb ethdb.Database, config Config) (*DB, error) { numWorkers := config.NumWorkers if numWorkers <= 0 { numWorkers = runtime.NumCPU() } db := &DB{ - dataDir: dataDir, - bb: bb, + diskdb: diskdb, root: core.Terminator, numWorkers: numWorkers, } - // Run WAL recovery. - seqn, err := bb.Recover(walPath) - if err != nil { - bb.Close() - return nil, fmt.Errorf("nomt/db: recover: %w", err) - } - if seqn > 0 { - db.syncSeqn = seqn + // Load persisted root. + if data, err := diskdb.Get(nomtMetaRootKey); err == nil && len(data) == 32 { + copy(db.root[:], data) } return db, nil @@ -114,20 +79,6 @@ func (db *DB) Root() core.Node { return db.root } -// SetRoot sets the current trie root (used when loading state from metadata). -func (db *DB) SetRoot(root core.Node) { - db.mu.Lock() - defer db.mu.Unlock() - db.root = root -} - -// SyncSeqn returns the current sync sequence number. -func (db *DB) SyncSeqn() uint32 { - db.mu.RLock() - defer db.mu.RUnlock() - return db.syncSeqn -} - // Update applies a batch of stem key-value pairs to the trie. // The pairs are sorted internally before processing. 
func (db *DB) Update(ops []core.StemKeyValue) (core.Node, error) { @@ -148,58 +99,76 @@ func (db *DB) UpdateSorted(ops []core.StemKeyValue) (core.Node, error) { defer db.mu.Unlock() pageSetFactory := func() merkle.PageSet { - return newBitboxPageSet(db.bb) + return newPebblePageSet(db.diskdb) } out := merkle.ParallelUpdate(db.root, ops, db.numWorkers, pageSetFactory) - // Persist updated pages. - walPath := filepath.Join(db.dataDir, walFileName) - db.syncSeqn++ - if err := db.bb.FullSync(walPath, db.syncSeqn, out.Pages); err != nil { - return core.Terminator, fmt.Errorf("nomt/db: sync: %w", err) + // Persist updated pages via atomic batch write. + batch := db.diskdb.NewBatch() + for _, up := range out.Pages { + key := nomtPageKey(up.PageID) + if up.Diff.IsCleared() { + if err := batch.Delete(key); err != nil { + return core.Terminator, fmt.Errorf("nomt/db: delete page: %w", err) + } + } else { + if err := batch.Put(key, up.Page[:]); err != nil { + return core.Terminator, fmt.Errorf("nomt/db: put page: %w", err) + } + } + } + // Persist root. + if err := batch.Put(nomtMetaRootKey, out.Root[:]); err != nil { + return core.Terminator, fmt.Errorf("nomt/db: put root: %w", err) + } + if err := batch.Write(); err != nil { + return core.Terminator, fmt.Errorf("nomt/db: batch write: %w", err) } db.root = out.Root return out.Root, nil } -// LoadPage loads a page from Bitbox storage by its PageID. +// LoadPage loads a page from ethdb storage by its PageID. func (db *DB) LoadPage(pageID core.PageID) (*core.RawPage, error) { - page, _, found, err := db.bb.LoadPage(pageID) + data, err := db.diskdb.Get(nomtPageKey(pageID)) if err != nil { - return nil, fmt.Errorf("nomt/db: load page: %w", err) + return nil, nil // Not found. 
} - if !found { - return nil, nil + if len(data) != core.PageSize { + return nil, fmt.Errorf("nomt/db: page size mismatch: got %d, want %d", len(data), core.PageSize) } + page := new(core.RawPage) + copy(page[:], data) return page, nil } -// Close closes the database. +// Close is a no-op — the ethdb lifecycle is managed by the caller. func (db *DB) Close() error { - return db.bb.Close() + return nil } -// --- BitboxPageSet --- +// --- PebblePageSet --- -// bitboxPageSet implements merkle.PageSet backed by Bitbox disk storage. -type bitboxPageSet struct { - bb *bitbox.DB - cache map[string]*core.RawPage +// pebblePageSet implements merkle.PageSet backed by ethdb (PebbleDB). +type pebblePageSet struct { + diskdb ethdb.Database + cache map[string]*core.RawPage } -func newBitboxPageSet(bb *bitbox.DB) *bitboxPageSet { - return &bitboxPageSet{ - bb: bb, - cache: make(map[string]*core.RawPage, 16), +func newPebblePageSet(diskdb ethdb.Database) *pebblePageSet { + return &pebblePageSet{ + diskdb: diskdb, + cache: make(map[string]*core.RawPage, 16), } } -func (ps *bitboxPageSet) Get(pageID core.PageID) ( +func (ps *pebblePageSet) Get(pageID core.PageID) ( *core.RawPage, merkle.PageOrigin, bool, ) { - key := pageIDKey(pageID) + key := pageIDCacheKey(pageID) if cached, ok := ps.cache[key]; ok { + // Return a copy so the walker can mutate freely. pageCopy := new(core.RawPage) *pageCopy = *cached return pageCopy, merkle.PageOrigin{ @@ -207,8 +176,8 @@ func (ps *bitboxPageSet) Get(pageID core.PageID) ( }, true } - page, _, found, err := ps.bb.LoadPage(pageID) - if err != nil || !found { + data, err := ps.diskdb.Get(nomtPageKey(pageID)) + if err != nil || len(data) != core.PageSize { // Return a fresh page if not found — this handles the case // where the trie is being built from scratch or expanded // into new regions. 
@@ -216,7 +185,11 @@ func (ps *bitboxPageSet) Get(pageID core.PageID) ( return fresh, merkle.PageOrigin{Kind: merkle.PageOriginFresh}, true } + page := new(core.RawPage) + copy(page[:], data) ps.cache[key] = page + + // Return a copy so the walker can mutate freely. pageCopy := new(core.RawPage) *pageCopy = *page return pageCopy, merkle.PageOrigin{ @@ -224,26 +197,36 @@ func (ps *bitboxPageSet) Get(pageID core.PageID) ( }, true } -func (ps *bitboxPageSet) Contains(pageID core.PageID) bool { - key := pageIDKey(pageID) +func (ps *pebblePageSet) Contains(pageID core.PageID) bool { + key := pageIDCacheKey(pageID) if _, ok := ps.cache[key]; ok { return true } - _, _, found, _ := ps.bb.LoadPage(pageID) - return found + has, _ := ps.diskdb.Has(nomtPageKey(pageID)) + return has } -func (ps *bitboxPageSet) Fresh(pageID core.PageID) *core.RawPage { +func (ps *pebblePageSet) Fresh(pageID core.PageID) *core.RawPage { return new(core.RawPage) } -func (ps *bitboxPageSet) Insert( +func (ps *pebblePageSet) Insert( pageID core.PageID, page *core.RawPage, origin merkle.PageOrigin, ) { - ps.cache[pageIDKey(pageID)] = page + ps.cache[pageIDCacheKey(pageID)] = page } -func pageIDKey(id core.PageID) string { +// nomtPageKey builds the ethdb key for a NOMT trie page. +func nomtPageKey(id core.PageID) []byte { + encoded := id.Encode() + key := make([]byte, 1+len(encoded)) + key[0] = nomtPagePrefix + copy(key[1:], encoded[:]) + return key +} + +// pageIDCacheKey returns a string key for the in-memory cache. 
+func pageIDCacheKey(id core.PageID) string { encoded := id.Encode() return string(encoded[:]) } diff --git a/nomt/db/db_test.go b/nomt/db/db_test.go index ba00761eb3..021c6de545 100644 --- a/nomt/db/db_test.go +++ b/nomt/db/db_test.go @@ -3,32 +3,30 @@ package db import ( "testing" + "github.com/ethereum/go-ethereum/core/rawdb" "github.com/ethereum/go-ethereum/nomt/core" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -func TestOpenClose(t *testing.T) { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) +func newTestDB(t *testing.T) *DB { + t.Helper() + diskdb := rawdb.NewMemoryDatabase() + db, err := New(diskdb, DefaultConfig()) require.NoError(t, err) - - assert.Equal(t, core.Terminator, db.Root()) - require.NoError(t, db.Close()) + t.Cleanup(func() { db.Close() }) + return db } -func TestOpenCreatesDirectory(t *testing.T) { - dir := t.TempDir() + "/subdir" - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() +func TestNewClose(t *testing.T) { + db := newTestDB(t) assert.Equal(t, core.Terminator, db.Root()) } -func TestReopenPreservesState(t *testing.T) { - dir := t.TempDir() +func TestReopenPreservesRoot(t *testing.T) { + diskdb := rawdb.NewMemoryDatabase() - db, err := Open(dir, DefaultConfig()) + db, err := New(diskdb, DefaultConfig()) require.NoError(t, err) newRoot, err := db.Update([]core.StemKeyValue{ @@ -36,23 +34,17 @@ func TestReopenPreservesState(t *testing.T) { }) require.NoError(t, err) require.False(t, core.IsTerminator(&newRoot)) - require.NoError(t, db.Close()) - // Reopen and set the root. - db2, err := Open(dir, DefaultConfig()) + // "Reopen" by creating a new DB on the same ethdb. + db2, err := New(diskdb, DefaultConfig()) require.NoError(t, err) - defer db2.Close() - // Root is not automatically persisted (that's the geth integration's - // job), but the pages should still be on disk. 
- assert.Equal(t, core.Terminator, db2.Root()) + // Root is now persisted in PebbleDB, so it should be recovered. + assert.Equal(t, newRoot, db2.Root()) } func TestUpdateSingleKey(t *testing.T) { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() + db := newTestDB(t) newRoot, err := db.Update([]core.StemKeyValue{ {Stem: makeStem(0x10), Hash: makeHash(0x42)}, @@ -64,10 +56,7 @@ func TestUpdateSingleKey(t *testing.T) { } func TestUpdateMultipleKeys(t *testing.T) { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() + db := newTestDB(t) ops := []core.StemKeyValue{ {Stem: makeStem(0x10), Hash: makeHash(0x01)}, @@ -86,11 +75,7 @@ func TestUpdateDeterministic(t *testing.T) { } run := func() core.Node { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() - + db := newTestDB(t) root, err := db.Update(ops) require.NoError(t, err) return root @@ -102,10 +87,7 @@ func TestUpdateDeterministic(t *testing.T) { } func TestUpdateEmptyOps(t *testing.T) { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() + db := newTestDB(t) root, err := db.Update(nil) require.NoError(t, err) @@ -113,10 +95,7 @@ func TestUpdateEmptyOps(t *testing.T) { } func TestUpdateSortsByStem(t *testing.T) { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() + db := newTestDB(t) // Provide stems in reverse order — should still work. 
ops := []core.StemKeyValue{ @@ -129,27 +108,6 @@ func TestUpdateSortsByStem(t *testing.T) { assert.False(t, core.IsTerminator(&root)) } -func TestSyncSeqnIncrements(t *testing.T) { - dir := t.TempDir() - db, err := Open(dir, DefaultConfig()) - require.NoError(t, err) - defer db.Close() - - assert.Equal(t, uint32(0), db.SyncSeqn()) - - _, err = db.Update([]core.StemKeyValue{ - {Stem: makeStem(0x10), Hash: makeHash(0x01)}, - }) - require.NoError(t, err) - assert.Equal(t, uint32(1), db.SyncSeqn()) - - _, err = db.Update([]core.StemKeyValue{ - {Stem: makeStem(0x80), Hash: makeHash(0x02)}, - }) - require.NoError(t, err) - assert.Equal(t, uint32(2), db.SyncSeqn()) -} - func makeStem(b byte) core.StemPath { var sp core.StemPath for i := range sp { diff --git a/nomt/merkle/worker_test.go b/nomt/merkle/worker_test.go index f867489e4c..1b619cebbb 100644 --- a/nomt/merkle/worker_test.go +++ b/nomt/merkle/worker_test.go @@ -99,7 +99,7 @@ func TestAssignToWorkersMoreWorkersThanChildren(t *testing.T) { // --- Integration tests --- // permissivePageSet wraps MemoryPageSet to return fresh pages for missing -// entries (matching bitboxPageSet behavior). This is needed because the +// entries (matching pebblePageSet behavior). This is needed because the // parallel workers descend into child pages that may not exist yet. type permissivePageSet struct { *MemoryPageSet diff --git a/trie/nomttrie/compat_test.go b/trie/nomttrie/compat_test.go index 987024b8cf..5a39c88550 100644 --- a/trie/nomttrie/compat_test.go +++ b/trie/nomttrie/compat_test.go @@ -29,14 +29,11 @@ func newBintrie(t *testing.T) *bintrie.BinaryTrie { return bt } -// newNomtTrieForCompat creates a NomtTrie with in-memory ethdb and temp Bitbox. +// newNomtTrieForCompat creates a NomtTrie with in-memory ethdb. 
func newNomtTrieForCompat(t *testing.T) *NomtTrie { t.Helper() diskdb := rawdb.NewMemoryDatabase() - backend := nomtdb.New(diskdb, &nomtdb.Config{ - DataDir: t.TempDir(), - HTCapacity: 1 << 16, - }) + backend := nomtdb.New(diskdb, nil) t.Cleanup(func() { backend.Close() }) tr, err := New(common.Hash{}, backend) diff --git a/trie/nomttrie/trie_test.go b/trie/nomttrie/trie_test.go index 07a81c262b..c591128108 100644 --- a/trie/nomttrie/trie_test.go +++ b/trie/nomttrie/trie_test.go @@ -14,15 +14,11 @@ import ( "github.com/stretchr/testify/require" ) -// newTestTrie creates a NomtTrie backed by an in-memory ethdb and a temp -// Bitbox directory. Returns the trie and a cleanup function. +// newTestTrie creates a NomtTrie backed by an in-memory ethdb. func newTestTrie(t *testing.T) *NomtTrie { t.Helper() diskdb := rawdb.NewMemoryDatabase() - backend := nomtdb.New(diskdb, &nomtdb.Config{ - DataDir: t.TempDir(), - HTCapacity: 1 << 16, - }) + backend := nomtdb.New(diskdb, nil) t.Cleanup(func() { backend.Close() }) tr, err := New(common.Hash{}, backend) diff --git a/trie/triecompare/compare_test.go b/trie/triecompare/compare_test.go index b0bf5eabc9..5646f69574 100644 --- a/trie/triecompare/compare_test.go +++ b/trie/triecompare/compare_test.go @@ -2,9 +2,6 @@ package triecompare import ( "fmt" - "math/bits" - "os" - "path/filepath" "testing" "github.com/ethereum/go-ethereum/common" @@ -33,19 +30,15 @@ func newBintrie(t testing.TB) *bintrie.BinaryTrie { return bt } -func newNomtTrieWithDir(t testing.TB, htCapacity uint64) (*nomttrie.NomtTrie, string) { +func newNomtTrie(t testing.TB) *nomttrie.NomtTrie { t.Helper() diskdb := rawdb.NewMemoryDatabase() - dir := t.TempDir() - backend := nomtdb.New(diskdb, &nomtdb.Config{ - DataDir: dir, - HTCapacity: htCapacity, - }) + backend := nomtdb.New(diskdb, nil) t.Cleanup(func() { backend.Close() }) nt, err := nomttrie.New(common.Hash{}, backend) require.NoError(t, err) - return nt, dir + return nt } // applyOp applies a single StateOp 
to both bintrie and nomttrie. @@ -116,10 +109,9 @@ func TestRootEquality(t *testing.T) { for name, cfg := range configs { t.Run(name, func(t *testing.T) { blocks := GenerateBlocks(cfg) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, (cfg.MinSlots+cfg.MaxSlots)/2) bt := newBintrie(t) - nt, _ := newNomtTrieWithDir(t, htCap) + nt := newNomtTrie(t) for blockIdx, ops := range blocks { for _, op := range ops { @@ -144,11 +136,7 @@ func TestRootEquality(t *testing.T) { func TestDeterminism(t *testing.T) { computeRoot := func() common.Hash { blocks := GenerateBlocks(smallConfig) - htCap := estimateHTCapacity( - smallConfig.NumAccounts, smallConfig.NumContracts, - (smallConfig.MinSlots+smallConfig.MaxSlots)/2, - ) - nt, _ := newNomtTrieWithDir(t, htCap) + nt := newNomtTrie(t) bt := newBintrie(t) var root common.Hash for _, ops := range blocks { @@ -185,10 +173,9 @@ func TestDistributionVariants(t *testing.T) { cfg.Seed = 123 // same seed for all blocks := GenerateBlocks(cfg) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, (cfg.MinSlots+cfg.MaxSlots)/2) bt := newBintrie(t) - nt, _ := newNomtTrieWithDir(t, htCap) + nt := newNomtTrie(t) var binRoot, nomtRoot common.Hash for _, ops := range blocks { @@ -224,10 +211,9 @@ func TestIncrementalRootEquality(t *testing.T) { Seed: 99, } blocks := GenerateBlocks(cfg) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, 3) bt := newBintrie(t) - nt, _ := newNomtTrieWithDir(t, htCap) + nt := newNomtTrie(t) for i, op := range blocks[0] { applyOp(t, bt, nt, op) @@ -242,8 +228,8 @@ func TestIncrementalRootEquality(t *testing.T) { t.Logf("verified %d incremental hashes match", len(blocks[0])) } -// TestStorageFootprint populates state and measures storage used by each -// implementation. Logs sizes and ratio. +// TestStorageFootprint populates state and measures serialized node sizes +// for bintrie. NOMT pages are now in ethdb, so only bintrie size is reported. 
func TestStorageFootprint(t *testing.T) { if testing.Short() { t.Skip("storage footprint test requires medium config") @@ -251,10 +237,9 @@ func TestStorageFootprint(t *testing.T) { cfg := mediumConfig blocks := GenerateBlocks(cfg) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, (cfg.MinSlots+cfg.MaxSlots)/2) bt := newBintrie(t) - nt, nomtDir := newNomtTrieWithDir(t, htCap) + nt := newNomtTrie(t) for _, ops := range blocks { for _, op := range ops { @@ -271,13 +256,7 @@ func TestStorageFootprint(t *testing.T) { _, ns := bt.Commit(false) binBytes := nodesetBytes(ns) - // NOMT: sum file sizes on disk. - nomtBytes := dirSize(t, nomtDir) - - ratio := float64(nomtBytes) / float64(max(binBytes, 1)) t.Logf("bintrie serialized nodes: %s (%d bytes)", humanBytes(binBytes), binBytes) - t.Logf("NOMT bitbox on disk: %s (%d bytes)", humanBytes(nomtBytes), nomtBytes) - t.Logf("NOMT / bintrie ratio: %.2fx", ratio) } // --------------------------------------------------------------------------- @@ -288,7 +267,6 @@ func BenchmarkUpdateAccount(b *testing.B) { cfg := smallConfig blocks := GenerateBlocks(cfg) ops := filterOps(blocks[0], OpUpdateAccount) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, 10) b.Run("bintrie", func(b *testing.B) { bt := newBintrie(b) @@ -300,7 +278,7 @@ func BenchmarkUpdateAccount(b *testing.B) { }) b.Run("nomt", func(b *testing.B) { - nt, _ := newNomtTrieWithDir(b, htCap) + nt := newNomtTrie(b) b.ResetTimer() for i := range b.N { op := ops[i%len(ops)] @@ -313,7 +291,6 @@ func BenchmarkUpdateStorage(b *testing.B) { cfg := smallConfig blocks := GenerateBlocks(cfg) ops := filterOps(blocks[0], OpUpdateStorage) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, 10) b.Run("bintrie", func(b *testing.B) { bt := newBintrie(b) @@ -325,7 +302,7 @@ func BenchmarkUpdateStorage(b *testing.B) { }) b.Run("nomt", func(b *testing.B) { - nt, _ := newNomtTrieWithDir(b, htCap) + nt := newNomtTrie(b) b.ResetTimer() for i := range 
b.N { op := ops[i%len(ops)] @@ -347,7 +324,6 @@ func BenchmarkHash(b *testing.B) { Seed: 77, } blocks := GenerateBlocks(cfg) - htCap := estimateHTCapacity(size, 0, 0) b.Run("bintrie", func(b *testing.B) { bt := newBintrie(b) @@ -367,7 +343,7 @@ func BenchmarkHash(b *testing.B) { }) b.Run("nomt", func(b *testing.B) { - nt, _ := newNomtTrieWithDir(b, htCap) + nt := newNomtTrie(b) for _, op := range blocks[0] { _ = nt.UpdateAccount(op.Address, op.Account, op.CodeLen) } @@ -388,7 +364,6 @@ func BenchmarkHash(b *testing.B) { func BenchmarkBlockWorkload(b *testing.B) { cfg := smallConfig blocks := GenerateBlocks(cfg) - htCap := estimateHTCapacity(cfg.NumAccounts, cfg.NumContracts, 10) // Use block 1 (mutations) as the repeated workload. workload := blocks[1] @@ -411,7 +386,7 @@ func BenchmarkBlockWorkload(b *testing.B) { }) b.Run("nomt", func(b *testing.B) { - nt, _ := newNomtTrieWithDir(b, htCap) + nt := newNomtTrie(b) for _, op := range blocks[0] { applyOpSingleNomt(b, nt, op) } @@ -480,35 +455,6 @@ func nodesetBytes(ns *trienode.NodeSet) int64 { return total } -// dirSize walks a directory and returns total file size in bytes. -func dirSize(t testing.TB, dir string) int64 { - t.Helper() - var total int64 - err := filepath.Walk(dir, func(_ string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - total += info.Size() - } - return nil - }) - require.NoError(t, err) - return total -} - -// estimateHTCapacity returns a power-of-2 hash table capacity for ~50% load. -// Each account uses ~1 stem; each contract uses 1 + ceil(avgSlots/256) stems. -func estimateHTCapacity(numAccounts, numContracts, avgSlots int) uint64 { - stems := numAccounts + numContracts - if avgSlots > 0 { - stems += numContracts * ((avgSlots + 255) / 256) - } - // 50% load factor → double the stem count, then round up to power of 2. - target := max(uint64(stems*2), 64) - return 1 << bits.Len64(target-1) -} - // humanBytes formats byte counts for log output. 
func humanBytes(b int64) string { switch { diff --git a/triedb/nomtdb/config.go b/triedb/nomtdb/config.go index 091eca5fae..8064e33355 100644 --- a/triedb/nomtdb/config.go +++ b/triedb/nomtdb/config.go @@ -8,15 +8,7 @@ package nomtdb // Config holds configuration for the NOMT triedb backend. type Config struct { - // DataDir is the directory for NOMT's Bitbox storage files. - DataDir string - - // HTCapacity is the number of hash table buckets. Must be a power of 2. - // Defaults to 1<<20 (~1M buckets) if zero. - HTCapacity uint64 -} - -// Defaults is the default configuration for the NOMT backend. -var Defaults = &Config{ - HTCapacity: 1 << 20, + // NumWorkers is the number of parallel goroutines for trie updates. + // Defaults to runtime.NumCPU() if zero. + NumWorkers int } diff --git a/triedb/nomtdb/database.go b/triedb/nomtdb/database.go index efbf0f9837..2baeea439e 100644 --- a/triedb/nomtdb/database.go +++ b/triedb/nomtdb/database.go @@ -11,23 +11,22 @@ import ( // Database is the NOMT triedb backend. It manages the NOMT trie engine for // page-based merkle storage and delegates flat state to geth's ethdb. type Database struct { - diskdb ethdb.Database // geth's existing PebbleDB for flat state + metadata - nomt *db.DB // NOMT trie engine (Bitbox page storage) + diskdb ethdb.Database // geth's existing PebbleDB for flat state + pages + nomt *db.DB // NOMT trie engine config *Config } -// New creates a new NOMT backend. The diskdb is used for flat state storage -// (accounts, storage slots) and NOMT metadata. The NOMT engine opens its own -// Bitbox files under config.DataDir. +// New creates a new NOMT backend. The diskdb is used for flat state storage, +// NOMT page storage, and metadata. Pass nil config for defaults. 
func New(diskdb ethdb.Database, config *Config) *Database { - if config.HTCapacity == 0 { - config.HTCapacity = Defaults.HTCapacity + if config == nil { + config = &Config{} } - nomtDB, err := db.Open(config.DataDir, db.Config{ - HTCapacity: config.HTCapacity, + nomtDB, err := db.New(diskdb, db.Config{ + NumWorkers: config.NumWorkers, }) if err != nil { - log.Crit("Failed to open NOMT database", "err", err) + log.Crit("Failed to create NOMT database", "err", err) } return &Database{ diskdb: diskdb,