diff --git a/cmd/geth/archivecmd.go b/cmd/geth/archivecmd.go new file mode 100644 index 0000000000..d6e241974e --- /dev/null +++ b/cmd/geth/archivecmd.go @@ -0,0 +1,576 @@ +// Copyright 2026 The go-ethereum Authors +// This file is part of go-ethereum. +// +// go-ethereum is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// go-ethereum is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with go-ethereum. If not, see . + +package main + +import ( + "encoding/binary" + "errors" + "fmt" + "os" + "path/filepath" + "slices" + "time" + + "github.com/ethereum/go-ethereum/cmd/utils" + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/core/rawdb" + "github.com/ethereum/go-ethereum/core/types" + "github.com/ethereum/go-ethereum/crypto" + "github.com/ethereum/go-ethereum/ethdb" + "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/rlp" + "github.com/ethereum/go-ethereum/trie" + "github.com/ethereum/go-ethereum/trie/archive" + "github.com/ethereum/go-ethereum/triedb/database" + "github.com/urfave/cli/v2" +) + +var ( + // Flags for the archive command + archiveOutputFlag = &cli.StringFlag{ + Name: "output", + Usage: "Path to archive output file", + Value: "", // Default: /nodearchive + } + archiveCompactionIntervalFlag = &cli.Uint64Flag{ + Name: "compaction-interval", + Usage: "Run compaction after this many subtrees (0 = disable)", + Value: 1000, + } + archiveDryRunFlag = &cli.BoolFlag{ + Name: "dry-run", + Usage: "Simulate without modifying database", + } + + // Commands + archiveCheckNodeFlag = &cli.StringFlag{ + Name: "owner", + Usage: "Owner hash (hex) for the trie node to check", + } + archiveCheckPathFlag = &cli.StringFlag{ + Name: "path", + Usage: "Path (hex nibbles) of the trie node to check", + } + + archiveCommand = &cli.Command{ + Name: "archive", + Usage: "Archive state trie nodes to reduce database size", + Subcommands: []*cli.Command{ + archiveGenerateCmd, + archiveVerifyCmd, + archiveDeleteJournalCmd, + archiveCheckNodeCmd, + }, + } + + archiveCheckNodeCmd = &cli.Command{ + Name: "check-node", + Usage: "Check if a specific trie node exists in the raw DB", + Action: archiveCheckNode, + Flags: slices.Concat([]cli.Flag{ + archiveCheckNodeFlag, + archiveCheckPathFlag, + }, utils.NetworkFlags, utils.DatabaseFlags), + } + + archiveDeleteJournalCmd = &cli.Command{ + Name: "delete-journal", + Usage: "Delete the pathdb journal to force a clean restart", + Action: archiveDeleteJournal, + Flags: slices.Concat(utils.NetworkFlags, utils.DatabaseFlags), + Description: ` +Deletes the pathdb journal (TrieJournal key and merkle.journal file) from the +database. This forces geth to restart with a bare disk layer, discarding any +in-memory diff layers that may be inconsistent with archived state. + +Use this after running 'archive generate' if geth was started in between and +recreated the journal. + +Examples: + geth archive delete-journal --datadir /path/to/datadir + geth archive delete-journal --hoodi +`, + } + + archiveVerifyCmd = &cli.Command{ + Name: "verify", + Usage: "Verify all archived nodes can be correctly resurrected", + Action: archiveVerify, + Flags: slices.Concat(utils.NetworkFlags, utils.DatabaseFlags), + Description: ` +Walks the entire state trie, resolving every expired node from the archive +file and verifying that the reconstructed subtree hash matches the original. +Also walks all storage tries referenced by accounts. + +The database is opened read-only. No modifications are made. + +Examples: + geth archive verify --datadir /path/to/datadir + geth archive verify --hoodi +`, + } + + archiveGenerateCmd = &cli.Command{ + Name: "generate", + Usage: "Generate archive files from height-3 subtrees", + ArgsUsage: "[state-root]", + Action: archiveGenerate, + Flags: slices.Concat([]cli.Flag{ + archiveOutputFlag, + archiveCompactionIntervalFlag, + archiveDryRunFlag, + }, utils.NetworkFlags, utils.DatabaseFlags), + Description: ` +Walks the state trie of the specified root (or head block) and archives +subtrees at height 3. Each archived subtree is replaced with an expiredNode +that references the archive file offset and size. + +Height is measured from leaves: leaves=0, parents=1, etc. A height-3 node +has leaves at most 3 levels below it. + +The archiver reads trie nodes directly from the persistent database layer, +bypassing any in-memory diff layers. This ensures consistency between the +data it reads and the data it modifies. + +Examples: + # Archive from the persistent disk state + geth archive generate --datadir /path/to/datadir + + # Dry run to see what would be archived + geth archive generate --dry-run --datadir /path/to/datadir + + # Custom output and compaction interval + geth archive generate --output /path/to/archive --compaction-interval 500 +`, + } +) + +// rawDBNodeReader implements database.NodeReader by reading trie nodes directly +// from the raw key-value database, bypassing pathdb's in-memory diff layers. +// This ensures the archiver sees the same trie state it modifies. +type rawDBNodeReader struct { + db ethdb.KeyValueReader +} + +func (r *rawDBNodeReader) Node(owner common.Hash, path []byte, hash common.Hash) ([]byte, error) { + var blob []byte + if owner == (common.Hash{}) { + blob = rawdb.ReadAccountTrieNode(r.db, path) + } else { + blob = rawdb.ReadStorageTrieNode(r.db, owner, path) + } + // Skip hash verification: the raw DB may contain expiredNode markers + // (blob[0] == 0x00) which have different hashes than the original nodes. + return blob, nil +} + +// rawDBNodeDatabase implements database.NodeDatabase using direct raw DB reads. +type rawDBNodeDatabase struct { + db ethdb.KeyValueReader + root common.Hash +} + +func (d *rawDBNodeDatabase) NodeReader(stateRoot common.Hash) (database.NodeReader, error) { + // Only allow reading the persistent disk root state + if stateRoot != d.root { + return nil, fmt.Errorf("raw DB reader only supports disk root %x, got %x", d.root, stateRoot) + } + return &rawDBNodeReader{db: d.db}, nil +} + +func archiveGenerate(ctx *cli.Context) error { + // 1. Setup node and databases + stack, _ := makeConfigNode(ctx) + defer stack.Close() + + dryRun := ctx.Bool(archiveDryRunFlag.Name) + chaindb := utils.MakeChainDatabase(ctx, stack, dryRun) + defer chaindb.Close() + + // Check state scheme - we only support PathDB + scheme := cycleCheckScheme(ctx, chaindb) + if scheme != rawdb.PathScheme { + return fmt.Errorf("archive generation requires path-based state scheme, got: %s", scheme) + } + + // 2. Flush diff layers to disk via pathdb. This ensures the raw DB + // contains the complete, up-to-date state trie and that state history + // entries are properly written to the freezer. + trieDB := utils.MakeTrieDatabase(ctx, stack, chaindb, false, dryRun, false) + head, hasDiff := trieDB.DiffHead() + if hasDiff { + log.Info("Flushing diff layers to disk", "head", head) + if err := trieDB.Commit(head, true); err != nil { + trieDB.Close() + return fmt.Errorf("failed to flush diff layers: %w", err) + } + log.Info("Diff layers flushed successfully") + } else { + log.Info("No diff layers to flush, disk state is current", "root", head) + } + // Close triedb — we work directly with raw DB for archival. + // We'll re-open it at the end to write a fresh journal. + trieDB.Close() + + // 3. Determine the disk state root (now up-to-date after flush). + rootBlob := rawdb.ReadAccountTrieNode(chaindb, nil) + if len(rootBlob) == 0 { + return errors.New("state trie not found in database") + } + root := crypto.Keccak256Hash(rootBlob) + log.Info("Using disk state root", "root", root) + + // Create a raw DB node reader that bypasses pathdb layers + nodeDB := &rawDBNodeDatabase{db: chaindb, root: root} + + // 4. Open archive writer (unless dry-run). + // The archive file is placed at /geth/nodearchive by default, + // matching the path used by ArchivedNodeResolver when reading back. + var writer *archive.ArchiveWriter + archivePath := ctx.String(archiveOutputFlag.Name) + if archivePath == "" { + archivePath = filepath.Join(stack.ResolvePath(""), "nodearchive") + } + + if !dryRun { + var err error + writer, err = archive.NewArchiveWriter(archivePath) + if err != nil { + return fmt.Errorf("failed to open archive file %s: %w", archivePath, err) + } + defer writer.Close() + log.Info("Opened archive file", "path", archivePath) + } else { + log.Info("Dry run mode - no changes will be made") + } + + // 5. Create and run archiver + archiver := trie.NewArchiver( + chaindb, + nodeDB, + writer, + ctx.Uint64(archiveCompactionIntervalFlag.Name), + dryRun, + ) + + start := time.Now() + if err := archiver.ProcessState(root); err != nil { + return fmt.Errorf("archive generation failed: %w", err) + } + + // 6. Get stats and optionally run final compaction + subtrees, leaves, bytesDeleted := archiver.Stats() + + if !dryRun && subtrees > 0 { + log.Info("Running final database compaction") + if err := chaindb.Compact(nil, nil); err != nil { + log.Warn("Final compaction failed", "err", err) + } + } + + // 7. Re-journal the pathdb state with the current disk root. + // After archiving, some trie nodes have been replaced with expired + // markers. We re-open pathdb and write a fresh journal (disk layer + // only, since all diff layers were flushed in step 2) so that geth + // can restart cleanly. + if !dryRun { + log.Info("Re-journaling pathdb state") + freshTrieDB := utils.MakeTrieDatabase(ctx, stack, chaindb, false, false, false) + freshRoot := crypto.Keccak256Hash(rawdb.ReadAccountTrieNode(chaindb, nil)) + if err := freshTrieDB.Journal(freshRoot); err != nil { + log.Warn("Failed to re-journal pathdb state", "err", err) + } + freshTrieDB.Close() + } + + // 8. Print summary + var archiveSize uint64 + if writer != nil { + archiveSize = writer.Offset() + } + + log.Info("Archive generation complete", + "subtrees", subtrees, + "leaves", leaves, + "bytesDeleted", bytesDeleted, + "archiveSize", archiveSize, + "elapsed", common.PrettyDuration(time.Since(start))) + + if dryRun { + log.Info("This was a dry run - no changes were made to the database") + } + + return nil +} + +func archiveVerify(ctx *cli.Context) error { + stack, _ := makeConfigNode(ctx) + defer stack.Close() + + // Open database read-only + chaindb := utils.MakeChainDatabase(ctx, stack, true) + defer chaindb.Close() + + scheme := cycleCheckScheme(ctx, chaindb) + if scheme != rawdb.PathScheme { + return fmt.Errorf("archive verify requires path-based state scheme, got: %s", scheme) + } + + // Set archive data dir so ArchivedNodeResolver can find the file + // ResolvePath("") returns the node's data directory (e.g. .ethereum/hoodi/geth), + // but ArchivedNodeResolver expects the instance directory (.ethereum/hoodi) + // since it appends "geth/nodearchive" itself. + archive.ArchiveDataDir = filepath.Dir(stack.ResolvePath("")) + + // Compute disk root + rootBlob := rawdb.ReadAccountTrieNode(chaindb, nil) + if len(rootBlob) == 0 { + return errors.New("state trie not found in database") + } + root := crypto.Keccak256Hash(rootBlob) + log.Info("Verifying archived nodes", "root", root) + + nodeDB := &rawDBNodeDatabase{db: chaindb, root: root} + + // Open account trie + accountTrie, err := trie.New(trie.StateTrieID(root), nodeDB) + if err != nil { + return fmt.Errorf("failed to open account trie: %w", err) + } + + var ( + totalAccounts int + totalStorageTries int + totalLeaves int + totalExpired int + totalErrors int + start = time.Now() + lastLog = time.Now() + ) + + // Walk the account trie — this resolves all expired nodes and verifies hashes + accountStats, err := accountTrie.Walk(func(path []byte, value []byte) error { + totalAccounts++ + if time.Since(lastLog) > 30*time.Second { + log.Info("Verification progress", + "accounts", totalAccounts, + "storageTries", totalStorageTries, + "leaves", totalLeaves, + "expired", totalExpired, + "errors", totalErrors) + lastLog = time.Now() + } + + // Decode account to check for storage trie + var acc types.StateAccount + if err := rlp.DecodeBytes(value, &acc); err != nil { + log.Warn("Failed to decode account", "err", err) + totalErrors++ + return nil // continue walking + } + if acc.Root == types.EmptyRootHash { + return nil + } + + // Open and walk storage trie. + // path is hex-nibble encoded (with a 16 terminator from the trie key), + // so convert nibble pairs back to the 32-byte account hash. + nibbles := path + if len(nibbles) > 0 && nibbles[len(nibbles)-1] == 16 { + nibbles = nibbles[:len(nibbles)-1] + } + keyBytes := make([]byte, len(nibbles)/2) + for i := 0; i < len(nibbles); i += 2 { + keyBytes[i/2] = nibbles[i]<<4 | nibbles[i+1] + } + accountHash := common.BytesToHash(keyBytes) + storageID := trie.StorageTrieID(root, accountHash, acc.Root) + storageTrie, err := trie.New(storageID, nodeDB) + if err != nil { + log.Warn("Failed to open storage trie", "account", accountHash, "err", err) + totalErrors++ + return nil + } + + storageStats, err := storageTrie.Walk(func(spath []byte, svalue []byte) error { + return nil + }) + if err != nil { + log.Warn("Storage trie walk failed", "account", accountHash, "err", err) + totalErrors++ + return nil + } + totalStorageTries++ + totalLeaves += storageStats.Leaves + totalExpired += storageStats.ExpiredResolved + return nil + }) + if err != nil { + return fmt.Errorf("account trie walk failed: %w", err) + } + + totalLeaves += accountStats.Leaves + totalExpired += accountStats.ExpiredResolved + + log.Info("Archive verification complete", + "accounts", totalAccounts, + "storageTries", totalStorageTries, + "totalLeaves", totalLeaves, + "expiredResolved", totalExpired, + "errors", totalErrors, + "elapsed", common.PrettyDuration(time.Since(start))) + + if totalErrors > 0 { + return fmt.Errorf("verification completed with %d errors", totalErrors) + } + return nil +} + +func archiveDeleteJournal(ctx *cli.Context) error { + stack, _ := makeConfigNode(ctx) + defer stack.Close() + + chaindb := utils.MakeChainDatabase(ctx, stack, false) + defer chaindb.Close() + + // Delete the pathdb journal KV key + if err := chaindb.Delete([]byte("TrieJournal")); err != nil { + log.Warn("Failed to delete pathdb journal key", "err", err) + } else { + log.Info("Deleted pathdb journal key (TrieJournal)") + } + + // Delete the journal file(s) - check both legacy and current locations + for _, dir := range []string{"triedb", ""} { + for _, name := range []string{"merkle.journal", "verkle.journal"} { + journalFile := filepath.Join(stack.ResolvePath(dir), name) + if err := os.Remove(journalFile); err == nil { + log.Info("Deleted journal file", "path", journalFile) + } else if !os.IsNotExist(err) { + log.Warn("Failed to delete journal file", "path", journalFile, "err", err) + } + } + } + + return nil +} + +func archiveCheckNode(ctx *cli.Context) error { + stack, _ := makeConfigNode(ctx) + defer stack.Close() + + chaindb := utils.MakeChainDatabase(ctx, stack, true) + defer chaindb.Close() + + ownerHex := ctx.String(archiveCheckNodeFlag.Name) + pathHex := ctx.String(archiveCheckPathFlag.Name) + + if ownerHex == "" { + return errors.New("--owner flag is required") + } + + owner := common.HexToHash(ownerHex) + + // Parse path: hex nibbles like "08" → []byte{0, 8} + var path []byte + for _, c := range pathHex { + var nibble byte + switch { + case c >= '0' && c <= '9': + nibble = byte(c - '0') + case c >= 'a' && c <= 'f': + nibble = byte(c-'a') + 10 + case c >= 'A' && c <= 'F': + nibble = byte(c-'A') + 10 + default: + return fmt.Errorf("invalid hex char in path: %c", c) + } + path = append(path, nibble) + } + + log.Info("Checking node in raw DB", "owner", owner, "path", fmt.Sprintf("%x", path)) + + // Read the node directly from the raw DB + isAccount := owner == (common.Hash{}) + + // Check the target path and all prefixes up to root + for i := len(path); i >= 0; i-- { + subpath := path[:i] + var blob []byte + if isAccount { + blob = rawdb.ReadAccountTrieNode(chaindb, subpath) + } else { + blob = rawdb.ReadStorageTrieNode(chaindb, owner, subpath) + } + + status := "MISSING" + details := "" + if len(blob) > 0 { + if blob[0] == 0x00 { + status = "EXPIRED" + if len(blob) == 17 { + offset := binary.BigEndian.Uint64(blob[1:9]) + size := binary.BigEndian.Uint64(blob[9:17]) + details = fmt.Sprintf("offset=%d size=%d", offset, size) + } + } else { + status = fmt.Sprintf("PRESENT (%d bytes, first=0x%02x)", len(blob), blob[0]) + } + } + label := "prefix" + if i == len(path) { + label = "TARGET" + } + if i == 0 { + label = "ROOT" + } + log.Info("Node check", + "label", label, + "path", fmt.Sprintf("%x", subpath), + "pathLen", i, + "status", status, + "details", details) + } + + // Also check a few child paths to see what's below the target + for nibble := byte(0); nibble < 16; nibble++ { + childPath := append(append([]byte{}, path...), nibble) + var blob []byte + if isAccount { + blob = rawdb.ReadAccountTrieNode(chaindb, childPath) + } else { + blob = rawdb.ReadStorageTrieNode(chaindb, owner, childPath) + } + if len(blob) > 0 { + status := fmt.Sprintf("PRESENT (%d bytes, first=0x%02x)", len(blob), blob[0]) + if blob[0] == 0x00 && len(blob) == 17 { + offset := binary.BigEndian.Uint64(blob[1:9]) + size := binary.BigEndian.Uint64(blob[9:17]) + status = fmt.Sprintf("EXPIRED offset=%d size=%d", offset, size) + } + log.Info("Child node", "path", fmt.Sprintf("%x", childPath), "status", status) + } + } + + return nil +} + +// cycleCheckScheme returns the state scheme for the database. +// It's a helper to check what scheme is in use. +func cycleCheckScheme(ctx *cli.Context, db ethdb.Database) string { + return rawdb.ReadStateScheme(db) +} diff --git a/cmd/geth/main.go b/cmd/geth/main.go index 5e90164aaa..528bd8400d 100644 --- a/cmd/geth/main.go +++ b/cmd/geth/main.go @@ -240,6 +240,8 @@ func init() { dumpConfigCommand, // see dbcmd.go dbCommand, + // See archivecmd.go + archiveCommand, // See cmd/utils/flags_legacy.go utils.ShowDeprecated, // See snapshot.go diff --git a/node/node.go b/node/node.go index 7c0d69775c..d9c38c4b6c 100644 --- a/node/node.go +++ b/node/node.go @@ -38,6 +38,7 @@ import ( "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/p2p" "github.com/ethereum/go-ethereum/rpc" + "github.com/ethereum/go-ethereum/trie/archive" "github.com/gofrs/flock" ) @@ -85,6 +86,7 @@ func New(conf *Config) (*Node, error) { return nil, err } conf.DataDir = absdatadir + archive.ArchiveDataDir = absdatadir } if conf.Logger == nil { conf.Logger = log.New() diff --git a/trie/archive/archive.go b/trie/archive/archive.go new file mode 100644 index 0000000000..d4f4fb2382 --- /dev/null +++ b/trie/archive/archive.go @@ -0,0 +1,95 @@ +// Copyright 2026 go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +package archive + +import ( + "bytes" + "errors" + "fmt" + "io" + "os" + "path/filepath" + + "github.com/ethereum/go-ethereum/rlp" +) + +// ResolverFn is a callback to resolve expired nodes from an archive file. +// Given an offset and size, it returns the serialized node data from the archive. +type ResolverFn func(offset, size uint64) ([]*Record, error) + +// OffsetSize is the size of the file offset in bytes. +const OffsetSize = 8 + +var ( + EmptyArchiveRecord = errors.New("empty record") // The archive contained a size-zero record. + ErrNoResolver = errors.New("no archive resolver set for expired node") // An expired node is accessed without a resolver. +) + +// Record contains an archive file record. It is not the most optimal +// structure, since any modification to it will need to be overwritten. +type Record struct { + Path []byte + Value []byte +} + +// ArchiveDataDir is the data directory where the archive file is stored. +var ArchiveDataDir string + +// ArchivedNodeResolver takes a buffer containing the archive data +// held by an expiring node (an offset and a size) and returns a +// list of records, which is a list of serialized leaf nodes. The +// caller knows the context (MPT, binary trie) and is responsible +// for decoding the nodes. +func ArchivedNodeResolver(offset, size uint64) ([]*Record, error) { + file, err := os.Open(filepath.Join(ArchiveDataDir, "geth", "nodearchive")) + if err != nil { + return nil, fmt.Errorf("error opening archive file: %w", err) + } + defer file.Close() + + o, err := file.Seek(int64(offset), io.SeekStart) + if err != nil { + return nil, fmt.Errorf("error seeking into archive file: %w", err) + } + if uint64(o) != offset { + return nil, fmt.Errorf("invalid offset: want %d, got %d", offset, o) + } + + data := make([]byte, size) + if _, err := io.ReadFull(file, data); err != nil { + return nil, fmt.Errorf("error reading data from archive: %w", err) + } + + var records []*Record + stream := rlp.NewStream(bytes.NewReader(data), uint64(len(data))) + for len(data) > 0 { + _, size, err := stream.Kind() + if err == io.EOF { + break + } + if err != nil { + return nil, fmt.Errorf("error getting rlp kind from archive data: %w", err) + } + var record Record + err = stream.Decode(&record) + if err != nil { + return nil, fmt.Errorf("error decoding rlp record from archive data (offset=%d, size=%d): %w", offset, size, err) + } + records = append(records, &record) + } + return records, nil +} diff --git a/trie/archive/writer.go b/trie/archive/writer.go new file mode 100644 index 0000000000..98b4ecce4b --- /dev/null +++ b/trie/archive/writer.go @@ -0,0 +1,92 @@ +// Copyright 2026 go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +package archive + +import ( + "os" + "sync" + + "github.com/ethereum/go-ethereum/rlp" +) + +// ArchiveWriter is an append-only writer for archive files. +// It writes RLP-encoded records to a file and tracks the current offset. +type ArchiveWriter struct { + file *os.File + offset uint64 + mu sync.Mutex +} + +// NewArchiveWriter creates a new archive writer that appends to the given file. +// If the file exists, it will be opened in append mode and writing continues +// from the current end of file. If it doesn't exist, it will be created. +func NewArchiveWriter(path string) (*ArchiveWriter, error) { + file, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return nil, err + } + info, err := file.Stat() + if err != nil { + file.Close() + return nil, err + } + return &ArchiveWriter{ + file: file, + offset: uint64(info.Size()), + }, nil +} + +// WriteSubtree writes all records belonging to a subtree and returns +// the starting offset and total size of the written data. +// This is the atomic unit of archival - all records for a subtree are +// written together and can be retrieved together using the returned +// offset and size. +func (w *ArchiveWriter) WriteSubtree(records []*Record) (offset uint64, size uint64, err error) { + w.mu.Lock() + defer w.mu.Unlock() + + startOffset := w.offset + for _, rec := range records { + encoded, err := rlp.EncodeToBytes(rec) + if err != nil { + return 0, 0, err + } + if _, err := w.file.Write(encoded); err != nil { + return 0, 0, err + } + w.offset += uint64(len(encoded)) + } + return startOffset, w.offset - startOffset, nil +} + +// Sync flushes the file to disk. This should be called after writing +// a subtree and before modifying the database to ensure crash consistency. +func (w *ArchiveWriter) Sync() error { + return w.file.Sync() +} + +// Close closes the archive file. +func (w *ArchiveWriter) Close() error { + return w.file.Close() +} + +// Offset returns the current write offset in the file. +func (w *ArchiveWriter) Offset() uint64 { + w.mu.Lock() + defer w.mu.Unlock() + return w.offset +} diff --git a/trie/archiver.go b/trie/archiver.go new file mode 100644 index 0000000000..32b2eae711 --- /dev/null +++ b/trie/archiver.go @@ -0,0 +1,599 @@ +// Copyright 2026 go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +package trie + +import ( + "encoding/binary" + "fmt" + "time" + + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/core/rawdb" + "github.com/ethereum/go-ethereum/core/types" + "github.com/ethereum/go-ethereum/ethdb" + "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/rlp" + "github.com/ethereum/go-ethereum/trie/archive" + "github.com/ethereum/go-ethereum/triedb/database" +) + +// subtreeInfo holds information about a subtree to be archived. +// It contains all the data needed to write the subtree to an archive +// and replace it with an expiredNode in the database. +type subtreeInfo struct { + path []byte // Hex-encoded path to subtree root + owner common.Hash // Zero for account trie, account hash for storage + height int // Height of subtree (from leaves) + leaves []*archive.Record // All leaf records (relative path + encoded node) + nodePaths [][]byte // Paths of all nodes to delete + rootHash common.Hash // Hash of the original subtree root (for verification) +} + +// Archiver handles the archival process of trie nodes. +// It walks the state trie, identifies subtrees at height 3, +// archives their leaf data, and replaces them with expiredNode markers. +// +// The archiver uses a streaming approach: it walks the trie using a +// NodeIterator, probes each node's height via bounded raw DB reads, +// and archives subtrees immediately when found. This keeps memory +// usage proportional to the iterator stack depth + the current subtree +// being processed, rather than loading the entire trie into memory. +type Archiver struct { + db ethdb.Database + triedb database.NodeDatabase + writer *archive.ArchiveWriter + compactionInterval uint64 + dryRun bool + stateRoot common.Hash + + // Progress tracking + subtreesArchived uint64 + bytesDeleted uint64 + leavesArchived uint64 + lastCompaction uint64 +} + +// NewArchiver creates a new archiver instance. +// +// Parameters: +// - db: The underlying key-value database +// - triedb: The trie database for reading nodes +// - writer: Archive file writer (can be nil for dry run) +// - compactionInterval: Run compaction after this many subtrees (0 = disable) +// - dryRun: If true, don't modify the database +func NewArchiver(db ethdb.Database, triedb database.NodeDatabase, + writer *archive.ArchiveWriter, compactionInterval uint64, dryRun bool) *Archiver { + return &Archiver{ + db: db, + triedb: triedb, + writer: writer, + compactionInterval: compactionInterval, + dryRun: dryRun, + } +} + +// ProcessState archives subtrees from the given state root. +// It processes storage tries first, then the account trie. +func (a *Archiver) ProcessState(root common.Hash) error { + a.stateRoot = root + + accountTrie, err := New(StateTrieID(root), a.triedb) + if err != nil { + return fmt.Errorf("failed to open account trie: %w", err) + } + + log.Info("Processing storage tries") + iter, err := accountTrie.NodeIterator(nil) + if err != nil { + return fmt.Errorf("failed to create account iterator: %w", err) + } + + kvIter := NewIterator(iter) + for kvIter.Next() { + // Decode the account to check for storage + var acc types.StateAccount + if err := rlp.DecodeBytes(kvIter.Value, &acc); err != nil { + log.Warn("Failed to decode account", "err", err) + continue + } + if acc.Root == types.EmptyRootHash { + continue + } + + // Process this account's storage trie + accountHash := common.BytesToHash(kvIter.Key) + storageID := StorageTrieID(root, accountHash, acc.Root) + storageTrie, err := New(storageID, a.triedb) + if err != nil { + log.Warn("Failed to open storage trie", "account", accountHash, "err", err) + continue + } + + if err := a.processTrie(accountHash, storageTrie); err != nil { + log.Warn("Failed to process storage trie", "account", accountHash, "err", err) + } + } + + if kvIter.Err != nil { + return fmt.Errorf("account iteration error: %w", kvIter.Err) + } + + log.Info("Processing account trie", "root", root) + if err := a.processTrie(common.Hash{}, accountTrie); err != nil { + return fmt.Errorf("failed to process account trie: %w", err) + } + + return nil +} + +// processTrie finds and archives all height-3 subtrees in the trie using +// a streaming approach. It walks the trie with a NodeIterator, probes each +// node's height via bounded raw DB reads, and archives subtrees immediately. +// +// Memory usage is O(iterator_stack_depth + current_subtree_size) instead of +// O(entire_trie) as with the previous recursive approach. +func (a *Archiver) processTrie(owner common.Hash, t *Trie) error { + if t.root == nil { + return nil + } + + iter, err := t.NodeIterator(nil) + if err != nil { + return fmt.Errorf("failed to create node iterator: %w", err) + } + + var ( + lastLog = time.Now() + found uint64 + ) + + for iter.Next(true) { + if iter.Leaf() { + continue + } + + // Progress logging + if time.Since(lastLog) > 30*time.Second { + log.Info("Scanning trie for subtrees", + "owner", owner, + "path", common.Bytes2Hex(iter.Path()), + "found", found, + "archived", a.subtreesArchived) + lastLog = time.Now() + } + + path := copyBytes(iter.Path()) + hash := iter.Hash() + if hash == (common.Hash{}) { + // Embedded node (no hash), skip — it will be part of a + // parent subtree. + continue + } + + // Probe subtree height via bounded raw DB reads. + // This does NOT load the trie into memory — it reads blobs from + // the DB, decodes them, computes height, and discards them. + height := a.probeHeight(owner, path, hash, 3) + if height != 3 { + // Too small to archive; the iterator will visit children. + // Too tall — descend into children to find height-3 subtrees. + continue + } + + // height == 3: collect and archive this subtree immediately. + info := a.collectSubtree(owner, path, hash) + if info == nil { + continue + } + found++ + + if err := a.archiveSubtree(info); err != nil { + log.Warn("Failed to archive subtree", "path", common.Bytes2Hex(path), "err", err) + continue + } + a.subtreesArchived++ + a.leavesArchived += uint64(len(info.leaves)) + + if err := a.maybeCompact(); err != nil { + log.Warn("Compaction failed", "err", err) + } + + // Skip children — they're now archived. + // We call Next(false) to move past the subtree without descending. + iter.Next(false) + } + + if iter.Error() != nil { + return fmt.Errorf("iterator error: %w", iter.Error()) + } + + log.Info("Found subtrees to archive", "owner", owner, "count", found) + return nil +} + +// probeHeight computes the height of a node by reading from the raw DB. +// It stops early once height exceeds maxHeight (returns maxHeight+1). +// The decoded nodes are not retained — they are discarded after inspection. +// +// Height is measured from leaves: leaves=0, their parents=1, etc. +func (a *Archiver) probeHeight(owner common.Hash, path []byte, hash common.Hash, maxHeight int) int { + blob := a.readNodeBlob(owner, path) + if len(blob) == 0 { + return 0 + } + + // Already expired — skip. + if blob[0] == expiredNodeMarker { + return -1 + } + + n, err := decodeNodeUnsafe(hash[:], blob) + if err != nil { + return 0 + } + + return a.nodeHeight(n, path, owner, maxHeight) +} + +// nodeHeight computes the height of a decoded node, bounded by maxHeight. +// Returns maxHeight+1 early if the subtree is taller than maxHeight. +func (a *Archiver) nodeHeight(n node, path []byte, owner common.Hash, maxHeight int) int { + switch n := n.(type) { + case nil: + return 0 + + case valueNode: + return 0 + + case *shortNode: + childPath := append(append([]byte{}, path...), n.Key...) + switch child := n.Val.(type) { + case valueNode: + return 1 // shortNode → leaf + case hashNode: + if maxHeight <= 1 { + return maxHeight + 1 + } + childHeight := a.probeHeight(owner, childPath, common.BytesToHash(child), maxHeight-1) + if childHeight < 0 { + return -1 // expired child + } + return childHeight + 1 + default: + // Inline node + childHeight := a.nodeHeight(child, childPath, owner, maxHeight-1) + if childHeight < 0 { + return -1 + } + return childHeight + 1 + } + + case *fullNode: + maxH := 0 + for i, child := range n.Children[:16] { + if child == nil { + continue + } + childPath := append(append([]byte{}, path...), byte(i)) + var childHeight int + switch c := child.(type) { + case valueNode: + childHeight = 0 + case hashNode: + if maxH+1 > maxHeight { + return maxHeight + 1 + } + childHeight = a.probeHeight(owner, childPath, common.BytesToHash(c), maxHeight-1) + default: + childHeight = a.nodeHeight(c, childPath, owner, maxHeight-1) + } + if childHeight < 0 { + continue // expired child, skip + } + h := childHeight + 1 + if h > maxH { + maxH = h + } + if maxH > maxHeight { + return maxHeight + 1 + } + } + return maxH + + case hashNode: + return a.probeHeight(owner, path, common.BytesToHash(n), maxHeight) + + case *expiredNode: + return -1 + } + return 0 +} + +// collectSubtree reads a height-3 subtree from the raw DB and collects its +// leaves and node paths for archival. The subtree is bounded (height ≤ 3), +// so memory usage is limited. +func (a *Archiver) collectSubtree(owner common.Hash, path []byte, hash common.Hash) *subtreeInfo { + blob := a.readNodeBlob(owner, path) + if len(blob) == 0 { + return nil + } + if blob[0] == expiredNodeMarker { + return nil + } + + n, err := decodeNodeUnsafe(hash[:], blob) + if err != nil { + log.Warn("Failed to decode node for collection", "path", common.Bytes2Hex(path), "err", err) + return nil + } + + info := &subtreeInfo{ + path: copyBytes(path), + owner: owner, + rootHash: hash, + } + + leaves, nodePaths, height, err := a.collectNodeLeaves(n, path, nil, owner) + if err != nil { + log.Warn("Failed to collect subtree leaves", "path", common.Bytes2Hex(path), "err", err) + return nil + } + + info.height = height + info.leaves = leaves + info.nodePaths = append([][]byte{copyBytes(path)}, nodePaths...) + return info +} + +// collectNodeLeaves recursively collects all leaves and node paths in a +// bounded subtree. relPath is the path relative to the subtree root. +// Returns (leaves, nodePaths, height, error). +func (a *Archiver) collectNodeLeaves(n node, absPath, relPath []byte, owner common.Hash) ([]*archive.Record, [][]byte, int, error) { + switch n := n.(type) { + case nil: + return nil, nil, 0, nil + + case valueNode: + return []*archive.Record{{ + Path: copyBytes(relPath), + Value: []byte(n), + }}, nil, 0, nil + + case *shortNode: + childAbsPath := append(append([]byte{}, absPath...), n.Key...) + var childNode node + switch c := n.Val.(type) { + case hashNode: + resolved, err := a.resolveRawNode(owner, childAbsPath, common.BytesToHash(c)) + if err != nil { + return nil, nil, 0, fmt.Errorf("resolve shortNode child at %s: %w", common.Bytes2Hex(childAbsPath), err) + } + childNode = resolved + default: + childNode = c + } + + // Pass nil relPath to child — we prepend the key ourselves + leaves, nodePaths, height, err := a.collectNodeLeaves(childNode, childAbsPath, nil, owner) + if err != nil { + return nil, nil, 0, err + } + + // Prepend [relPath + extension key] to leaf relative paths + prefix := append(append([]byte{}, relPath...), n.Key...) + for _, leaf := range leaves { + leaf.Path = append(append([]byte{}, prefix...), leaf.Path...) + } + + return leaves, append([][]byte{copyBytes(absPath)}, nodePaths...), height + 1, nil + + case *fullNode: + var ( + allLeaves []*archive.Record + allPaths [][]byte + maxHeight int + ) + for i, child := range n.Children[:16] { + if child == nil { + continue + } + childAbsPath := append(append([]byte{}, absPath...), byte(i)) + + var childNode node + switch c := child.(type) { + case hashNode: + resolved, err := a.resolveRawNode(owner, childAbsPath, common.BytesToHash(c)) + if err != nil { + return nil, nil, 0, fmt.Errorf("resolve fullNode child[%x] at %s: %w", i, common.Bytes2Hex(childAbsPath), err) + } + childNode = resolved + default: + childNode = c + } + + // Pass nil relPath to child — we prepend the index ourselves + leaves, nodePaths, height, err := a.collectNodeLeaves(childNode, childAbsPath, nil, owner) + if err != nil { + return nil, nil, 0, err + } + + // Prepend [relPath + branch index] to leaf relative paths + prefix := append(append([]byte{}, relPath...), byte(i)) + for _, leaf := range leaves { + leaf.Path = append(append([]byte{}, prefix...), leaf.Path...) + } + + allLeaves = append(allLeaves, leaves...) + allPaths = append(allPaths, nodePaths...) + h := height + 1 + if h > maxHeight { + maxHeight = h + } + } + return allLeaves, allPaths, maxHeight, nil + + case hashNode: + resolved, err := a.resolveRawNode(owner, absPath, common.BytesToHash(n)) + if err != nil { + return nil, nil, 0, err + } + return a.collectNodeLeaves(resolved, absPath, relPath, owner) + + case *expiredNode: + return nil, nil, 0, nil + } + return nil, nil, 0, nil +} + +// readNodeBlob reads a trie node blob directly from the raw key-value +// database, bypassing pathdb layers. +func (a *Archiver) readNodeBlob(owner common.Hash, path []byte) []byte { + if owner == (common.Hash{}) { + return rawdb.ReadAccountTrieNode(a.db, path) + } + return rawdb.ReadStorageTrieNode(a.db, owner, path) +} + +// resolveRawNode reads and decodes a trie node directly from the raw DB. +// Unlike resolveNode, this does NOT use the trie database (no caching, +// no diff layers). The decoded node is ephemeral and will be GC'd after use. +func (a *Archiver) resolveRawNode(owner common.Hash, path []byte, hash common.Hash) (node, error) { + blob := a.readNodeBlob(owner, path) + if len(blob) == 0 { + return nil, fmt.Errorf("node not found: owner=%s path=%s", owner, common.Bytes2Hex(path)) + } + if blob[0] == expiredNodeMarker { + return &expiredNode{}, nil + } + return decodeNodeUnsafe(hash[:], blob) +} + +// archiveSubtree writes leaves to archive and replaces subtree with expiredNode. +func (a *Archiver) archiveSubtree(info *subtreeInfo) error { + if a.dryRun { + log.Info("Would archive subtree", + "path", common.Bytes2Hex(info.path), + "owner", info.owner, + "height", info.height, + "leaves", len(info.leaves), + "nodes", len(info.nodePaths)) + return nil + } + + // 1. Write to archive file + offset, size, err := a.writer.WriteSubtree(info.leaves) + if err != nil { + return fmt.Errorf("failed to write subtree to archive: %w", err) + } + + // 2. Sync to ensure durability before modifying DB + if err := a.writer.Sync(); err != nil { + return fmt.Errorf("failed to sync archive: %w", err) + } + + // 3. Verify archive round-trip: reconstruct trie from records and + // check that the hash matches the original subtree root. This + // catches any data corruption before we delete the original nodes. + if info.rootHash != (common.Hash{}) { + reconstructed, err := archiveRecordsToNode(info.leaves) + if err != nil { + return fmt.Errorf("archive verification failed: cannot reconstruct trie from records: %w", err) + } + h := newHasher(false) + gotHash := common.BytesToHash(h.hash(reconstructed, true)) + returnHasherToPool(h) + if gotHash != info.rootHash { + return fmt.Errorf("archive verification failed: hash mismatch at path %s owner %s: got %s want %s (leaves=%d offset=%d size=%d)", + common.Bytes2Hex(info.path), info.owner, gotHash, info.rootHash, + len(info.leaves), offset, size) + } + } + + // 4. Batch database operations + batch := a.db.NewBatch() + + // Delete all nodes in subtree (except the root which we'll overwrite) + for _, nodePath := range info.nodePaths[1:] { // Skip first (root) + if info.owner == (common.Hash{}) { + rawdb.DeleteAccountTrieNode(batch, nodePath) + } else { + rawdb.DeleteStorageTrieNode(batch, info.owner, nodePath) + } + a.bytesDeleted += uint64(len(nodePath)) + } + + // Write expiredNode at subtree root + expiredBlob := encodeExpiredNodeBlob(offset, size) + if info.owner == (common.Hash{}) { + rawdb.WriteAccountTrieNode(batch, info.path, expiredBlob) + } else { + rawdb.WriteStorageTrieNode(batch, info.owner, info.path, expiredBlob) + } + + if err := batch.Write(); err != nil { + return fmt.Errorf("failed to write batch: %w", err) + } + + log.Debug("Archived subtree", + "path", common.Bytes2Hex(info.path), + "owner", info.owner, + "leaves", len(info.leaves), + "offset", offset, + "size", size) + + return nil +} + +// maybeCompact runs database compaction if the threshold is reached. +func (a *Archiver) maybeCompact() error { + if a.compactionInterval == 0 { + return nil + } + if a.subtreesArchived-a.lastCompaction >= a.compactionInterval { + log.Info("Running database compaction", "subtrees", a.subtreesArchived) + if err := a.db.Compact(nil, nil); err != nil { + return err + } + a.lastCompaction = a.subtreesArchived + } + return nil +} + +// encodeExpiredNodeBlob creates the raw bytes for an expiredNode. +// Format: 1-byte marker (0x00) + 8-byte offset + 8-byte size = 17 bytes +func encodeExpiredNodeBlob(offset, size uint64) []byte { + buf := make([]byte, 1+2*archive.OffsetSize) // 17 bytes + buf[0] = expiredNodeMarker // 0x00 + binary.BigEndian.PutUint64(buf[1:], offset) + binary.BigEndian.PutUint64(buf[1+archive.OffsetSize:], size) + return buf +} + +// Stats returns archival statistics. +func (a *Archiver) Stats() (subtrees, leaves, bytesDeleted uint64) { + return a.subtreesArchived, a.leavesArchived, a.bytesDeleted +} + +// copyBytes returns a copy of the given byte slice. +func copyBytes(b []byte) []byte { + if b == nil { + return nil + } + c := make([]byte, len(b)) + copy(c, b) + return c +} diff --git a/trie/committer.go b/trie/committer.go index 2a2142e0ff..7ea4e690cf 100644 --- a/trie/committer.go +++ b/trie/committer.go @@ -79,6 +79,8 @@ func (c *committer) commit(path []byte, n node, parallel bool) node { return cn case hashNode: return cn + case *expiredNode: + return cn default: // nil, valuenode shouldn't be committed panic(fmt.Sprintf("%T: invalid node: %v", n, n)) diff --git a/trie/expired_node.go b/trie/expired_node.go new file mode 100644 index 0000000000..ce622daa03 --- /dev/null +++ b/trie/expired_node.go @@ -0,0 +1,262 @@ +// Copyright 2026 go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +package trie + +import ( + "bytes" + "encoding/binary" + "fmt" + "time" + + "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/rlp" + "github.com/ethereum/go-ethereum/trie/archive" +) + +// expiredNodeMarker is a special marker byte to identify expired nodes. +// Using 0x00 as a marker since valid MPT nodes are always RLP lists (starting with 0xc0+). +const expiredNodeMarker = 0x00 + +// expiredNode represents a node whose data has been archived. +// It stores the file offset and size of the archived data. +type expiredNode struct { + offset uint64 + size uint64 + cachedHash hashNode + archiveResolver archive.ResolverFn +} + +func (n *expiredNode) cache() (hashNode, bool) { + return n.cachedHash, n.cachedHash == nil +} + +func (n *expiredNode) encode(w rlp.EncoderBuffer) { + var buf [1 + 2*archive.OffsetSize]byte + buf[0] = expiredNodeMarker + binary.BigEndian.PutUint64(buf[1:], n.offset) + binary.BigEndian.PutUint64(buf[1+archive.OffsetSize:], n.size) + w.Write(buf[:]) +} + +func (n *expiredNode) fstring(ind string) string { + return fmt.Sprintf(" ", n.offset, n.size) +} + +// Offset returns the archive file offset for this expired node. +func (n *expiredNode) Offset() uint64 { + return n.offset +} + +// SetArchiveResolver sets the resolver function for this expired node. +func (n *expiredNode) SetArchiveResolver(resolver archive.ResolverFn) { + n.archiveResolver = resolver +} + +// resolveExpiredNodeData resolves an expired node from the archive, verifies +// the reconstructed subtree hash, and stamps the cached hash onto the root. +// Returns an error if the archive data is corrupted (hash mismatch). +func resolveExpiredNodeData(n *expiredNode) (node, error) { + start := time.Now() + records, err := archive.ArchivedNodeResolver(n.offset, n.size) + if err != nil { + return nil, fmt.Errorf("failed to resolve expired node: %w", err) + } + resolved, err := archiveRecordsToNode(records) + if err != nil { + return nil, fmt.Errorf("failed to rebuild expired node from archive: %w", err) + } + depth := subtreeDepth(resolved) + log.Debug("Resurrected expired node from archive", + "offset", n.offset, "archiveBytes", n.size, + "records", len(records), "depth", depth, + "elapsed", time.Since(start)) + // Verify hash integrity: if the original hash is known, check that the + // reconstructed subtree produces the same hash. A mismatch means the + // archive is corrupted (e.g. missing leaves due to unresolvable hashNodes + // during archival) and any data from it is unreliable. + if n.cachedHash != nil { + h := newHasher(false) + gotHash := h.hash(resolved, true) + returnHasherToPool(h) + if !bytes.Equal(gotHash, n.cachedHash) { + return nil, fmt.Errorf("expired node hash mismatch at offset=%d size=%d: archive data is corrupted (expected %x got %x, %d records)", + n.offset, n.size, []byte(n.cachedHash), gotHash, len(records)) + } + // Stamp the original hash onto the resolved subtree root so the + // hasher returns it directly instead of re-computing. + switch nn := resolved.(type) { + case *fullNode: + nn.flags.hash = n.cachedHash + case *shortNode: + nn.flags.hash = n.cachedHash + } + } + // Mark the entire resolved subtree as dirty. This is critical for + // correctness with pathdb's diff layer model: when a trie with expired + // nodes is modified and committed, the committer only captures dirty + // nodes into the NodeSet (which becomes the diff layer). Without this + // marking, resolved-but-unmodified sibling nodes within the subtree + // would exist nowhere — not in any diff layer (they're clean) and not + // in the raw DB (the archiver deleted them). Subsequent trie accesses + // from higher diff layers would fall through to the disk layer, find + // nothing, and produce MissingNodeError. + // + // For read-only tries (only get operations, no commit), this dirty + // marking is harmless — the nodes are discarded when the trie is GC'd. + markSubtreeDirty(resolved) + return resolved, nil +} + +// subtreeDepth returns the maximum depth of a trie subtree. +func subtreeDepth(n node) int { + switch n := n.(type) { + case *fullNode: + max := 0 + for _, child := range &n.Children { + if child != nil { + if d := subtreeDepth(child); d > max { + max = d + } + } + } + return 1 + max + case *shortNode: + return 1 + subtreeDepth(n.Val) + default: + return 0 + } +} + +// markSubtreeDirty recursively marks all fullNode and shortNode in the +// subtree as dirty, preserving any cached hashes. This ensures the +// committer will capture them in the NodeSet during trie commit. +func markSubtreeDirty(n node) { + switch n := n.(type) { + case *fullNode: + n.flags.dirty = true + for _, child := range n.Children[:16] { + if child != nil { + markSubtreeDirty(child) + } + } + case *shortNode: + n.flags.dirty = true + markSubtreeDirty(n.Val) + } + // valueNode, hashNode, nil: no flags to mark +} + +func archiveRecordsToNode(records []*archive.Record) (node, error) { + if len(records) == 0 { + return nil, archive.EmptyArchiveRecord + } + + // Build the trie incrementally from nil to produce the canonical + // MPT structure. Starting with a fullNode would be wrong when the + // original subtree root was a shortNode (shared prefix). + var root node + for i, record := range records { + if err := validateRecordPath(record.Path); err != nil { + return nil, err + } + + key, err := normalizeRecordKey(record.Path) + if err != nil { + return nil, err + } + if len(key) < 1 { + return nil, fmt.Errorf("empty key in record #%d", i) + } + root, err = insertTrieNode(root, key, valueNode(record.Value)) + if err != nil { + return nil, err + } + } + return root, nil +} + +func validateRecordPath(path []byte) error { + for i, b := range path { + if b > 16 { + return fmt.Errorf("invalid nibble in record path: %d", b) + } + if b == 16 && i != len(path)-1 { + return fmt.Errorf("terminator nibble in middle of record path") + } + } + return nil +} + +// normalizeRecordKey ensures the record path is a hex-nibble key suitable for +// leaf insertion by guaranteeing a single terminator nibble and preserving any +// already-terminated path. Empty paths are normalized to a sole terminator. +func normalizeRecordKey(path []byte) ([]byte, error) { + if len(path) == 0 { + return []byte{16}, nil + } + if hasTerm(path) { + return path, nil + } + key := append([]byte{}, path...) + key = append(key, 16) + return key, nil +} + +func insertTrieNode(n node, key []byte, value node) (node, error) { + if len(key) == 0 { + return value, nil + } + switch n := n.(type) { + case *shortNode: + matchlen := prefixLen(key, n.Key) + if matchlen == len(n.Key) { + nn, err := insertTrieNode(n.Val, key[matchlen:], value) + if err != nil { + return nil, err + } + return &shortNode{Key: n.Key, Val: nn}, nil + } + branch := &fullNode{} + var err error + branch.Children[n.Key[matchlen]], err = insertTrieNode(nil, n.Key[matchlen+1:], n.Val) + if err != nil { + return nil, err + } + branch.Children[key[matchlen]], err = insertTrieNode(nil, key[matchlen+1:], value) + if err != nil { + return nil, err + } + if matchlen == 0 { + return branch, nil + } + return &shortNode{Key: key[:matchlen], Val: branch}, nil + + case *fullNode: + child, err := insertTrieNode(n.Children[key[0]], key[1:], value) + if err != nil { + return nil, err + } + n.Children[key[0]] = child + return n, nil + + case nil: + return &shortNode{Key: key, Val: value}, nil + + default: + return nil, fmt.Errorf("invalid node type in trie insert: %T", n) + } +} diff --git a/trie/expired_node_test.go b/trie/expired_node_test.go new file mode 100644 index 0000000000..40b3b056b4 --- /dev/null +++ b/trie/expired_node_test.go @@ -0,0 +1,601 @@ +// Copyright 2026 go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +package trie + +import ( + "bytes" + "errors" + "os" + "path/filepath" + "testing" + + "github.com/ethereum/go-ethereum/rlp" + "github.com/ethereum/go-ethereum/trie/archive" +) + +// setupTestArchive creates a temporary archive directory with an archive file +// containing the given records, and configures archive.ArchiveDataDir to point +// to it. It returns the offset and size of the written data, and a cleanup function. +func setupTestArchive(t *testing.T, records []*archive.Record) (offset, size uint64, cleanup func()) { + t.Helper() + tmpDir := t.TempDir() + gethDir := filepath.Join(tmpDir, "geth") + if err := os.MkdirAll(gethDir, 0755); err != nil { + t.Fatal(err) + } + + writer, err := archive.NewArchiveWriter(filepath.Join(gethDir, "nodearchive")) + if err != nil { + t.Fatal(err) + } + + offset, size, err = writer.WriteSubtree(records) + if err != nil { + writer.Close() + t.Fatal(err) + } + writer.Close() + + oldDir := archive.ArchiveDataDir + archive.ArchiveDataDir = tmpDir + + return offset, size, func() { + archive.ArchiveDataDir = oldDir + } +} + +func TestExpiredNodeEncodeDecode(t *testing.T) { + testCases := []struct { + offset uint64 + size uint64 + }{ + {0, 0}, + {1, 100}, + {255, 1024}, + {256, 4096}, + {1 << 16, 1 << 20}, + {1 << 32, 1 << 32}, + {1<<64 - 1, 1<<64 - 1}, + } + + for _, tc := range testCases { + original := &expiredNode{offset: tc.offset, size: tc.size} + + w := rlp.NewEncoderBuffer(nil) + original.encode(w) + encoded := w.ToBytes() + w.Flush() + + decoded, err := decodeNodeUnsafe(nil, encoded) + if err != nil { + t.Fatalf("failed to decode expired node with offset %d, size %d: %v", tc.offset, tc.size, err) + } + + expNode, ok := decoded.(*expiredNode) + if !ok { + t.Fatalf("decoded node is not an expired node, got %T", decoded) + } + + if expNode.offset != original.offset { + t.Errorf("offset mismatch: got %d, want %d", expNode.offset, original.offset) + } + if expNode.size != original.size { + t.Errorf("size mismatch: got %d, want %d", expNode.size, original.size) + } + } +} + +func TestExpiredNodeEncodedFormat(t *testing.T) { + node := &expiredNode{offset: 0x0102030405060708, size: 0x1112131415161718} + + w := rlp.NewEncoderBuffer(nil) + node.encode(w) + encoded := w.ToBytes() + w.Flush() + + expected := []byte{ + 0x00, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + } + if !bytes.Equal(encoded, expected) { + t.Errorf("encoded format mismatch: got %x, want %x", encoded, expected) + } +} + +func TestExpiredNodeFstring(t *testing.T) { + node := &expiredNode{offset: 12345, size: 6789} + s := node.fstring("") + if s != " " { + t.Errorf("fstring mismatch: got %q", s) + } +} + +func TestExpiredNodeCache(t *testing.T) { + node := &expiredNode{offset: 100} + hash, dirty := node.cache() + if hash != nil { + t.Error("expected nil hash from expired node cache") + } + if !dirty { + t.Error("expected dirty=true from expired node cache") + } +} + +func TestExpiredNodeInvalidLength(t *testing.T) { + invalidCases := [][]byte{ + {0x00}, + {0x00, 0x01}, + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}, + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}, + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11}, + } + + for _, buf := range invalidCases { + _, err := decodeNodeUnsafe(nil, buf) + if err == nil { + t.Errorf("expected error for buffer length %d, got nil", len(buf)) + } + } +} + +func TestExpiredNodeNoArchiveFile(t *testing.T) { + // When no archive file exists, Get should return an error + tmpDir := t.TempDir() + gethDir := filepath.Join(tmpDir, "geth") + if err := os.MkdirAll(gethDir, 0755); err != nil { + t.Fatal(err) + } + + oldDir := archive.ArchiveDataDir + archive.ArchiveDataDir = tmpDir + defer func() { archive.ArchiveDataDir = oldDir }() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: 100, size: 50} + + _, err := tr.Get([]byte("key")) + if err == nil { + t.Error("expected error when archive file doesn't exist") + } +} + +func TestExpiredNodeWithResolver(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("testvalue")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + val, err := tr.Get([]byte{0x12}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if string(val) != "testvalue" { + t.Errorf("value mismatch: got %q, want %q", val, "testvalue") + } +} + +func TestExpiredNodeCopy(t *testing.T) { + original := &expiredNode{ + offset: 12345, + size: 6789, + archiveResolver: archive.ArchivedNodeResolver, + } + + copied := copyNode(original) + copiedExp, ok := copied.(*expiredNode) + if !ok { + t.Fatalf("copied node is not an expired node, got %T", copied) + } + + if copiedExp.offset != original.offset { + t.Errorf("offset mismatch: got %d, want %d", copiedExp.offset, original.offset) + } + + if copiedExp.size != original.size { + t.Errorf("size mismatch: got %d, want %d", copiedExp.size, original.size) + } + + if copiedExp.archiveResolver == nil { + t.Error("archive resolver was not copied") + } +} + +func TestArchiveRecordsToNodeEmpty(t *testing.T) { + _, err := archiveRecordsToNode([]*archive.Record{}) + if !errors.Is(err, archive.EmptyArchiveRecord) { + t.Errorf("expected EmptyArchiveRecord error, got %v", err) + } + + _, err = archiveRecordsToNode(nil) + if !errors.Is(err, archive.EmptyArchiveRecord) { + t.Errorf("expected EmptyArchiveRecord error for nil slice, got %v", err) + } +} + +func TestArchiveRecordsToNodeMultiple(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 16}, Value: []byte("value1")}, + {Path: []byte{0x02, 16}, Value: []byte("value2")}, + } + + node, err := archiveRecordsToNode(records) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + fn, ok := node.(*fullNode) + if !ok { + t.Fatalf("expected fullNode, got %T", node) + } + + if fn.Children[0x01] == nil { + t.Error("expected child at index 0x01") + } + if fn.Children[0x02] == nil { + t.Error("expected child at index 0x02") + } +} + +func TestExpiredNodeGetMultipleRecords(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("value1")}, + {Path: []byte{0x04, 0x05, 16}, Value: []byte("value2")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + val, err := tr.Get([]byte{0x12}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if string(val) != "value1" { + t.Errorf("value mismatch: got %q, want %q", val, "value1") + } + + tr2 := NewEmpty(nil) + tr2.root = &expiredNode{offset: offset, size: size} + + val2, err := tr2.Get([]byte{0x45}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if string(val2) != "value2" { + t.Errorf("value mismatch: got %q, want %q", val2, "value2") + } +} + +func TestExpiredNodeGetKeyNotFound(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("value1")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + val, err := tr.Get([]byte{0xff, 0xff}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if val != nil { + t.Errorf("expected nil value for non-existent key, got %q", val) + } +} + +func TestExpiredNodeGetPathMismatch(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("testvalue")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + val, err := tr.Get([]byte{0x19}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if val != nil { + t.Errorf("expected nil value when leaf key doesn't match, got %q", val) + } +} + +func TestExpiredNodeInsert(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("existing")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + err := tr.Update([]byte{0x45}, []byte("newvalue")) + if err != nil { + t.Fatalf("unexpected error on insert: %v", err) + } + + val, err := tr.Get([]byte{0x45}) + if err != nil { + t.Fatalf("unexpected error on get: %v", err) + } + if string(val) != "newvalue" { + t.Errorf("value mismatch: got %q, want %q", val, "newvalue") + } +} + +func TestExpiredNodeUpdate(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("oldvalue")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + err := tr.Update([]byte{0x12}, []byte("newvalue")) + if err != nil { + t.Fatalf("unexpected error on update: %v", err) + } + + val, err := tr.Get([]byte{0x12}) + if err != nil { + t.Fatalf("unexpected error on get: %v", err) + } + if string(val) != "newvalue" { + t.Errorf("value mismatch: got %q, want %q", val, "newvalue") + } +} + +func TestExpiredNodeDelete(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("value1")}, + {Path: []byte{0x04, 0x05, 16}, Value: []byte("value2")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + err := tr.Delete([]byte{0x12}) + if err != nil { + t.Fatalf("unexpected error on delete: %v", err) + } + + val, err := tr.Get([]byte{0x12}) + if err != nil { + t.Fatalf("unexpected error on get after delete: %v", err) + } + if val != nil { + t.Errorf("expected nil after delete, got %q", val) + } + + val2, err := tr.Get([]byte{0x45}) + if err != nil { + t.Fatalf("unexpected error getting other key: %v", err) + } + if string(val2) != "value2" { + t.Errorf("other value should still exist: got %q, want %q", val2, "value2") + } +} + +func TestTrieCopyPreservesArchiveResolver(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("testvalue")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + trCopy := tr.Copy() + + val, err := trCopy.Get([]byte{0x12}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if string(val) != "testvalue" { + t.Errorf("value mismatch: got %q, want %q", val, "testvalue") + } +} + +func TestWalkWithExpiredNodes(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("value1")}, + {Path: []byte{0x04, 0x05, 16}, Value: []byte("value2")}, + {Path: []byte{0x07, 0x08, 16}, Value: []byte("value3")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + var leaves []string + stats, err := tr.Walk(func(path []byte, value []byte) error { + leaves = append(leaves, string(value)) + return nil + }) + if err != nil { + t.Fatalf("Walk failed: %v", err) + } + if stats.Leaves != 3 { + t.Errorf("expected 3 leaves, got %d", stats.Leaves) + } + if stats.ExpiredResolved != 1 { + t.Errorf("expected 1 expired resolved, got %d", stats.ExpiredResolved) + } + // Verify all values were visited + expected := map[string]bool{"value1": true, "value2": true, "value3": true} + for _, leaf := range leaves { + if !expected[leaf] { + t.Errorf("unexpected leaf value: %q", leaf) + } + delete(expected, leaf) + } + if len(expected) > 0 { + t.Errorf("missing leaves: %v", expected) + } +} + +func TestWalkEmptyTrie(t *testing.T) { + tr := NewEmpty(nil) + stats, err := tr.Walk(func(path []byte, value []byte) error { + t.Error("callback should not be called for empty trie") + return nil + }) + if err != nil { + t.Fatalf("Walk failed: %v", err) + } + if stats.Leaves != 0 || stats.ExpiredResolved != 0 { + t.Errorf("expected zero stats for empty trie, got leaves=%d expired=%d", stats.Leaves, stats.ExpiredResolved) + } +} + +func TestWalkCallbackError(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("value1")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + testErr := errors.New("test error") + _, err := tr.Walk(func(path []byte, value []byte) error { + return testErr + }) + if !errors.Is(err, testErr) { + t.Fatalf("expected test error, got %v", err) + } +} + +// TestExpiredNodeResolvedSubtreeDirty verifies that when an expired node is +// resolved and a sibling leaf is modified, the commit captures ALL resolved +// nodes (not just the modified path). Without this fix, resolved-but-unmodified +// nodes would be lost: not in the diff layer (clean) and not in the raw DB +// (deleted by archiver). +func TestExpiredNodeResolvedSubtreeDirty(t *testing.T) { + // Use large values (>32 bytes) so leaf nodes are NOT embedded in + // their parent. This matches production storage tries where + // intermediate nodes are large enough to be stored independently. + bigVal1 := bytes.Repeat([]byte("A"), 40) + bigVal2 := bytes.Repeat([]byte("B"), 40) + + // Create an archive with records under different branches. + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: bigVal1}, + {Path: []byte{0x04, 0x05, 16}, Value: bigVal2}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + // Insert a value that goes through one branch of the resolved subtree. + // This modifies path [1, ...] but leaves path [4, ...] unmodified. + if err := tr.Update([]byte{0x12}, bytes.Repeat([]byte("C"), 40)); err != nil { + t.Fatalf("Update failed: %v", err) + } + + // Commit the trie. The NodeSet should be non-nil because we modified data. + _, nodes := tr.Commit(false) + if nodes == nil { + t.Fatal("expected non-nil NodeSet after modifying expired subtree") + } + + // The resolved-but-unmodified sibling (path [4, 5]) should also be + // captured in the NodeSet, because markSubtreeDirty ensures all resolved + // nodes are dirty. Count the nodes to verify. + nodeCount := len(nodes.Nodes) + // We expect at least 3 nodes: the root, the modified branch, and the + // sibling branch. The exact count depends on trie structure. + if nodeCount < 3 { + t.Errorf("expected at least 3 nodes in NodeSet (root + modified + sibling), got %d", nodeCount) + } +} + +// TestMarkSubtreeDirty verifies that markSubtreeDirty correctly sets the dirty +// flag on all nodes in a subtree while preserving cached hashes. +func TestMarkSubtreeDirty(t *testing.T) { + // Build a small trie structure + leaf1 := &shortNode{Key: []byte{1, 16}, Val: valueNode("v1")} + leaf2 := &shortNode{Key: []byte{2, 16}, Val: valueNode("v2")} + branch := &fullNode{} + branch.Children[1] = leaf1 + branch.Children[2] = leaf2 + + // Set hash but not dirty (as if loaded from DB) + branch.flags = nodeFlag{hash: hashNode("testhash"), dirty: false} + leaf1.flags = nodeFlag{hash: hashNode("hash1"), dirty: false} + leaf2.flags = nodeFlag{hash: hashNode("hash2"), dirty: false} + + markSubtreeDirty(branch) + + // All nodes should be dirty + if !branch.flags.dirty { + t.Error("branch should be dirty") + } + if !leaf1.flags.dirty { + t.Error("leaf1 should be dirty") + } + if !leaf2.flags.dirty { + t.Error("leaf2 should be dirty") + } + + // Hashes should be preserved + if !bytes.Equal(branch.flags.hash, hashNode("testhash")) { + t.Error("branch hash should be preserved") + } + if !bytes.Equal(leaf1.flags.hash, hashNode("hash1")) { + t.Error("leaf1 hash should be preserved") + } + if !bytes.Equal(leaf2.flags.hash, hashNode("hash2")) { + t.Error("leaf2 hash should be preserved") + } +} + +func TestExpiredNodeGetNode(t *testing.T) { + records := []*archive.Record{ + {Path: []byte{0x01, 0x02, 16}, Value: []byte("testvalue")}, + } + offset, size, cleanup := setupTestArchive(t, records) + defer cleanup() + + tr := NewEmpty(nil) + tr.root = &expiredNode{offset: offset, size: size} + + _, _, err := tr.GetNode(hexToCompact([]byte{0x01, 0x02})) + if err != nil && err.Error() != "non-consensus node" { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/trie/hasher.go b/trie/hasher.go index a2a1f5b662..d4376e12e2 100644 --- a/trie/hasher.go +++ b/trie/hasher.go @@ -18,6 +18,7 @@ package trie import ( "bytes" + "encoding/binary" "fmt" "sync" @@ -97,6 +98,22 @@ func (h *hasher) hash(n node, force bool) []byte { // hash nodes don't have children, so they're left as were return n + case *expiredNode: + // Return the original subtree hash that was cached when the + // expired node was decoded. The parent node references this + // hash, so we must return the same value to keep the Merkle + // root consistent. + if n.cachedHash != nil { + return n.cachedHash + } + // Fallback: hash the marker blob (should not happen in practice + // because decodeNodeUnsafe always provides the hash). + var buf [1 + 2*8]byte // 17 bytes + buf[0] = expiredNodeMarker + binary.BigEndian.PutUint64(buf[1:], n.offset) + binary.BigEndian.PutUint64(buf[9:], n.size) + return h.hashData(buf[:]) + default: panic(fmt.Errorf("unexpected node type, %T", n)) } @@ -214,6 +231,12 @@ func (h *hasher) proofHash(original node) []byte { return bytes.Clone(h.encodeShortNode(n)) case *fullNode: return bytes.Clone(h.encodeFullNode(n)) + case *expiredNode: + var buf [1 + 2*8]byte + buf[0] = expiredNodeMarker + binary.BigEndian.PutUint64(buf[1:], n.offset) + binary.BigEndian.PutUint64(buf[9:], n.size) + return buf[:] default: panic(fmt.Errorf("unexpected node type, %T", original)) } diff --git a/trie/node.go b/trie/node.go index b5094ff4b7..f9e0840c1d 100644 --- a/trie/node.go +++ b/trie/node.go @@ -18,13 +18,16 @@ package trie import ( "bytes" + "encoding/binary" "fmt" "io" "strings" "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/crypto" + "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/rlp" + "github.com/ethereum/go-ethereum/trie/archive" ) var indices = []string{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f", "[17]"} @@ -158,6 +161,15 @@ func decodeNodeUnsafe(hash, buf []byte) (node, error) { if len(buf) == 0 { return nil, io.ErrUnexpectedEOF } + if buf[0] == expiredNodeMarker { + if len(buf) != 1+2*archive.OffsetSize { + return nil, fmt.Errorf("invalid expired node length: %d", len(buf)) + } + offset := binary.BigEndian.Uint64(buf[1:]) + size := binary.BigEndian.Uint64(buf[1+archive.OffsetSize:]) + log.Debug("Decoded expired node", "offset", offset, "size", size, "hash", common.BytesToHash(hash)) + return &expiredNode{offset: offset, size: size, cachedHash: hashNode(hash), archiveResolver: archive.ArchivedNodeResolver}, nil + } elems, _, err := rlp.SplitList(buf) if err != nil { return nil, fmt.Errorf("decode error: %v", err) diff --git a/trie/proof.go b/trie/proof.go index 58075daf9b..5be05c6f81 100644 --- a/trie/proof.go +++ b/trie/proof.go @@ -25,6 +25,7 @@ import ( "github.com/ethereum/go-ethereum/crypto" "github.com/ethereum/go-ethereum/ethdb" "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/trie/archive" ) // Prove constructs a merkle proof for key. The result contains all encoded nodes @@ -78,6 +79,16 @@ func (t *Trie) Prove(key []byte, proofDb ethdb.KeyValueWriter) error { // clean cache or the database, they are all in their own // copy and safe to use unsafe decoder. tn = mustDecodeNodeUnsafe(n, blob) + case *expiredNode: + records, err := archive.ArchivedNodeResolver(n.offset, n.size) + if err != nil { + return fmt.Errorf("failed to resolve expired node in proof: %w", err) + } + resolved, err := archiveRecordsToNode(records) + if err != nil { + return fmt.Errorf("failed to rebuild expired node in proof: %w", err) + } + tn = resolved default: panic(fmt.Sprintf("%T: invalid node: %v", tn, tn)) } @@ -617,6 +628,8 @@ func get(tn node, key []byte, skipResolved bool) ([]byte, node) { } case hashNode: return key, n + case *expiredNode: + return key, n case nil: return key, nil case valueNode: diff --git a/trie/trie.go b/trie/trie.go index 1ef2c2f1a6..69db68b515 100644 --- a/trie/trie.go +++ b/trie/trie.go @@ -26,6 +26,7 @@ import ( "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/core/types" "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/trie/archive" "github.com/ethereum/go-ethereum/trie/trienode" "github.com/ethereum/go-ethereum/triedb/database" "golang.org/x/sync/errgroup" @@ -57,6 +58,10 @@ type Trie struct { // reader is the handler trie can retrieve nodes from. reader *Reader + // archiveResolver is an optional callback to resolve expired nodes from + // an archive file. + archiveResolver archive.ResolverFn + // Various tracers for capturing the modifications to trie opTracer *opTracer prevalueTracer *PrevalueTracer @@ -70,17 +75,23 @@ func (t *Trie) newFlag() nodeFlag { // Copy returns a copy of Trie. func (t *Trie) Copy() *Trie { return &Trie{ - root: copyNode(t.root), - owner: t.owner, - committed: t.committed, - unhashed: t.unhashed, - uncommitted: t.uncommitted, - reader: t.reader, - opTracer: t.opTracer.copy(), - prevalueTracer: t.prevalueTracer.Copy(), + root: copyNode(t.root), + owner: t.owner, + committed: t.committed, + unhashed: t.unhashed, + uncommitted: t.uncommitted, + reader: t.reader, + archiveResolver: t.archiveResolver, + opTracer: t.opTracer.copy(), + prevalueTracer: t.prevalueTracer.Copy(), } } +// SetArchiveResolver sets the archive resolver callback for expired nodes. +func (t *Trie) SetArchiveResolver(resolver archive.ResolverFn) { + t.archiveResolver = resolver +} + // New creates the trie instance with provided trie id and the read-only // database. The state specified by trie id must be available, otherwise // an error will be returned. The trie root specified by trie id can be @@ -218,6 +229,14 @@ func (t *Trie) get(origNode node, key []byte, pos int) (value []byte, newnode no } value, newnode, _, err := t.get(child, key, pos) return value, newnode, true, err + case *expiredNode: + log.Debug("Resolving expired node in get()", "owner", t.owner, "offset", n.offset, "size", n.size, "pos", pos) + newnode, err := resolveExpiredNodeData(n) + if err != nil { + return nil, n, false, err + } + value, _, _, err = t.get(newnode, key, pos) + return value, newnode, true, err default: panic(fmt.Sprintf("%T: invalid node: %v", origNode, origNode)) } @@ -352,6 +371,14 @@ func (t *Trie) getNode(origNode node, path []byte, pos int) (item []byte, newnod item, newnode, resolved, err := t.getNode(child, path, pos) return item, newnode, resolved + 1, err + case *expiredNode: + rn, err := resolveExpiredNodeData(n) + if err != nil { + return nil, n, 0, err + } + item, newnode, resolvedCount, err := t.getNode(rn, path, pos) + return item, newnode, resolvedCount + 1, err + default: panic(fmt.Sprintf("%T: invalid node: %v", origNode, origNode)) } @@ -475,6 +502,18 @@ func (t *Trie) insert(n node, prefix, key []byte, value node) (bool, node, error } return true, nn, nil + case *expiredNode: + log.Debug("Resolving expired node in insert()", "owner", t.owner, "offset", n.offset, "size", n.size) + rn, err := resolveExpiredNodeData(n) + if err != nil { + return false, nil, err + } + dirty, nn, err := t.insert(rn, prefix, key, value) + if !dirty || err != nil { + return false, rn, err + } + return true, nn, nil + default: panic(fmt.Sprintf("%T: invalid node: %v", n, n)) } @@ -636,6 +675,18 @@ func (t *Trie) delete(n node, prefix, key []byte) (bool, node, error) { } return true, nn, nil + case *expiredNode: + log.Debug("Resolving expired node in delete()", "owner", t.owner, "offset", n.offset, "size", n.size) + rn, err := resolveExpiredNodeData(n) + if err != nil { + return false, nil, err + } + dirty, nn, err := t.delete(rn, prefix, key) + if !dirty || err != nil { + return false, rn, err + } + return true, nn, nil + default: panic(fmt.Sprintf("%T: invalid node: %v (%v)", n, n, key)) } @@ -666,14 +717,24 @@ func copyNode(n node) node { } case hashNode: return n + case *expiredNode: + return &expiredNode{ + offset: n.offset, + size: n.size, + cachedHash: common.CopyBytes(n.cachedHash), + archiveResolver: n.archiveResolver, + } default: panic(fmt.Sprintf("%T: unknown node type", n)) } } func (t *Trie) resolve(n node, prefix []byte) (node, error) { - if n, ok := n.(hashNode); ok { + switch n := n.(type) { + case hashNode: return t.resolveAndTrack(n, prefix) + case *expiredNode: + return resolveExpiredNodeData(n) } return n, nil } @@ -784,6 +845,58 @@ func (t *Trie) Witness() map[string][]byte { return t.prevalueTracer.Values() } +// WalkStats holds statistics from a Walk traversal. +type WalkStats struct { + Leaves int // Number of leaf nodes visited + ExpiredResolved int // Number of expired nodes resolved from archive +} + +// Walk recursively traverses the trie, resolving all nodes including +// hashNodes and expiredNodes. It calls fn for each leaf found. +// This triggers hash verification for expired nodes via cachedHash. +func (t *Trie) Walk(fn func(path []byte, value []byte) error) (WalkStats, error) { + return t.walk(t.root, nil, fn) +} + +func (t *Trie) walk(n node, path []byte, fn func([]byte, []byte) error) (WalkStats, error) { + switch n := n.(type) { + case *shortNode: + return t.walk(n.Val, append(append([]byte{}, path...), n.Key...), fn) + case *fullNode: + var stats WalkStats + for i, child := range n.Children[:16] { + if child != nil { + childStats, err := t.walk(child, append(append([]byte{}, path...), byte(i)), fn) + if err != nil { + return stats, err + } + stats.Leaves += childStats.Leaves + stats.ExpiredResolved += childStats.ExpiredResolved + } + } + return stats, nil + case hashNode: + resolved, err := t.resolveAndTrack(n, path) + if err != nil { + return WalkStats{}, err + } + return t.walk(resolved, path, fn) + case *expiredNode: + resolved, err := resolveExpiredNodeData(n) + if err != nil { + return WalkStats{}, err + } + childStats, err := t.walk(resolved, path, fn) + childStats.ExpiredResolved++ + return childStats, err + case valueNode: + return WalkStats{Leaves: 1}, fn(path, []byte(n)) + case nil: + return WalkStats{}, nil + } + return WalkStats{}, nil +} + // reset drops the referenced root node and cleans all internal state. func (t *Trie) reset() { t.root = nil diff --git a/triedb/database.go b/triedb/database.go index ef95169df1..71b578367b 100644 --- a/triedb/database.go +++ b/triedb/database.go @@ -399,6 +399,28 @@ func (db *Database) Disk() ethdb.Database { return db.disk } +// DiffHead returns the root hash of the topmost diff layer in pathdb. +// If there are no diff layers or the backend is not pathdb, it returns +// the zero hash and false. +func (db *Database) DiffHead() (common.Hash, bool) { + pdb, ok := db.backend.(*pathdb.Database) + if !ok { + return common.Hash{}, false + } + return pdb.DiffHead() +} + +// DisableStateHistory closes and disables the state history freezer. +// This is used by the archiver to bypass state history writes during +// diff layer flushing when state history may have gaps. +func (db *Database) DisableStateHistory() { + pdb, ok := db.backend.(*pathdb.Database) + if !ok { + return + } + pdb.DisableStateHistory() +} + // SnapshotCompleted returns the indicator if the snapshot is completed. func (db *Database) SnapshotCompleted() bool { pdb, ok := db.backend.(*pathdb.Database) diff --git a/triedb/pathdb/database.go b/triedb/pathdb/database.go index e52949c93e..ba606552df 100644 --- a/triedb/pathdb/database.go +++ b/triedb/pathdb/database.go @@ -318,6 +318,30 @@ func (db *Database) Update(root common.Hash, parentRoot common.Hash, block uint6 return db.tree.cap(root, maxDiffLayers) } +// DiffHead returns the root hash of the topmost diff layer. If there are no +// diff layers (only the disk layer), it returns the disk layer root and false. +func (db *Database) DiffHead() (common.Hash, bool) { + db.lock.RLock() + defer db.lock.RUnlock() + + return db.tree.diffHead() +} + +// DisableStateHistory closes and disables the state history freezer. This is +// used by the archiver to bypass state history writes during diff layer flushing, +// since the archiver only needs trie nodes committed to disk and state history +// may have gaps from unclean shutdowns that prevent sequential appends. +func (db *Database) DisableStateHistory() { + db.lock.Lock() + defer db.lock.Unlock() + + if db.stateFreezer != nil { + db.stateFreezer.Close() + db.stateFreezer = nil + log.Info("Disabled state history freezer") + } +} + // Commit traverses downwards the layer tree from a specified layer with the // provided state root and all the layers below are flattened downwards. It // can be used alone and mostly for test purposes. diff --git a/triedb/pathdb/history.go b/triedb/pathdb/history.go index 55ec29e4f0..4730802d9c 100644 --- a/triedb/pathdb/history.go +++ b/triedb/pathdb/history.go @@ -278,9 +278,17 @@ func truncateFromHead(store ethdb.AncientStore, typ historyType, nhead uint64) ( return 0, err } // Ensure that the truncation target falls within the valid range. - if ohead < nhead || nhead < otail { + if nhead < otail { return 0, fmt.Errorf("%w, %s, tail: %d, head: %d, target: %d", errHeadTruncationOutOfRange, typ, otail, ohead, nhead) } + // If the target is ahead of the current head, there's nothing to truncate. + // This can happen after unclean shutdowns where the state history was not + // fully written. + if ohead < nhead { + log.Warn("State history shorter than target, nothing to truncate", + "type", typ.String(), "head", ohead, "target", nhead) + return 0, nil + } // Short circuit if nothing to truncate. if ohead == nhead { return 0, nil diff --git a/triedb/pathdb/history_state_test.go b/triedb/pathdb/history_state_test.go index 5c3026a571..3e75d3baa8 100644 --- a/triedb/pathdb/history_state_test.go +++ b/triedb/pathdb/history_state_test.go @@ -244,8 +244,8 @@ func TestTruncateOutOfRange(t *testing.T) { target uint64 expErr error }{ - {0, head, nil}, // nothing to delete - {0, head + 1, errHeadTruncationOutOfRange}, + {0, head, nil}, // nothing to delete + {0, head + 1, nil}, // gracefully handled after unclean shutdown {0, tail - 1, errHeadTruncationOutOfRange}, {1, tail, nil}, // nothing to delete {1, head + 1, errTailTruncationOutOfRange}, diff --git a/triedb/pathdb/layertree.go b/triedb/pathdb/layertree.go index b20e40bd05..99fd23a2a1 100644 --- a/triedb/pathdb/layertree.go +++ b/triedb/pathdb/layertree.go @@ -31,6 +31,7 @@ import ( // of the referenced layer by themselves. type layerTree struct { base *diskLayer + head common.Hash // Root hash of the topmost layer (diff or disk) layers map[common.Hash]layer // descendants is a two-dimensional map where the keys represent @@ -59,6 +60,7 @@ func (tree *layerTree) init(head layer) { defer tree.lock.Unlock() current := head + tree.head = head.rootHash() tree.layers = make(map[common.Hash]layer) tree.descendants = make(map[common.Hash]map[common.Hash]struct{}) @@ -76,6 +78,18 @@ func (tree *layerTree) init(head layer) { tree.lookup = newLookup(head, tree.isDescendant) } +// diffHead returns the root hash of the topmost diff layer. If there are no +// diff layers, returns the disk layer root and false. +func (tree *layerTree) diffHead() (common.Hash, bool) { + tree.lock.RLock() + defer tree.lock.RUnlock() + + if _, ok := tree.layers[tree.head].(*diffLayer); ok { + return tree.head, true + } + return tree.base.rootHash(), false +} + // get retrieves a layer belonging to the given state root. func (tree *layerTree) get(root common.Hash) layer { tree.lock.RLock() diff --git a/triedb/pathdb/reader.go b/triedb/pathdb/reader.go index e087ef26ed..845667b578 100644 --- a/triedb/pathdb/reader.go +++ b/triedb/pathdb/reader.go @@ -69,7 +69,7 @@ func (r *reader) Node(owner common.Hash, path []byte, hash common.Hash) ([]byte, return nil, err } // Error out if the local one is inconsistent with the target. - if !r.noHashCheck && got != hash { + if !r.noHashCheck && (len(blob) > 0 && blob[0] != 0) && got != hash { // Location is always available even if the node // is not found. switch loc.loc {