go-ethereum/triedb/pathdb/history_index_pruner.go
rjl493456442 db6c7d06a2
Some checks are pending
/ Linux Build (push) Waiting to run
/ Linux Build (arm) (push) Waiting to run
/ Keeper Build (push) Waiting to run
/ Windows Build (push) Waiting to run
/ Docker Image (push) Waiting to run
triedb/pathdb: implement history index pruner (#33999)
This PR implements the missing functionality for archive nodes by 
pruning stale index data.

The current mechanism is relatively simple but sufficient for now: 
it periodically iterates over index entries and deletes outdated data 
on a per-block basis. 

The pruning process is triggered every 90,000 new blocks (approximately 
every 12 days), and the iteration typically takes ~30 minutes on a 
mainnet node.

This mechanism is only applied with `gcmode=archive` enabled, and has
no impact on normal full nodes.
2026-04-02 00:21:58 +02:00

385 lines
11 KiB
Go

// Copyright 2025 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package pathdb
import (
"encoding/binary"
"sync"
"sync/atomic"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/rawdb"
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/log"
)
const (
	// indexPruningThreshold defines the number of pruned histories that must
	// accumulate before triggering index pruning. This helps avoid scheduling
	// index pruning too frequently (90,000 blocks is roughly 12 days at
	// mainnet block pace, per the change description).
	indexPruningThreshold = 90000

	// iteratorReopenInterval is how long the iterator is kept open before
	// being released and re-opened. Long-lived iterators hold a read snapshot
	// that blocks LSM compaction; periodically re-opening avoids stalling the
	// compactor during a large scan.
	iteratorReopenInterval = 30 * time.Second
)
// indexPruner is responsible for pruning stale index data from the tail side
// when old history objects are removed. It runs as a background goroutine and
// processes pruning signals whenever the history tail advances.
//
// The pruning operates at the block level: for each state element's index
// metadata, leading index blocks whose maximum history ID falls below the
// new tail are removed entirely. This avoids the need to decode individual
// block contents and is efficient because index blocks store monotonically
// increasing history IDs.
type indexPruner struct {
	disk     ethdb.KeyValueStore // Backing key-value store holding the index data
	typ      historyType         // Kind of history (state or trienode) served by this pruner
	tail     atomic.Uint64       // Tail below which index entries can be pruned
	lastRun  uint64              // The tail in the last pruning run; only touched by the run goroutine
	trigger  chan struct{}       // Non-blocking signal that tail has advanced (buffered, size 1)
	closed   chan struct{}       // Closed on shutdown to terminate the background loop
	wg       sync.WaitGroup      // Tracks the background run goroutine for clean shutdown
	log      log.Logger          // Logger pre-tagged with the history type
	pauseReq chan chan struct{}  // Pause request; caller sends ack channel, pruner closes it when paused
	resumeCh chan struct{}       // Resume signal sent by caller after indexSingle/unindexSingle completes
}
// newIndexPruner creates and starts a new index pruner for the given history
// type. The returned pruner is already running; it must eventually be shut
// down via close.
func newIndexPruner(disk ethdb.KeyValueStore, typ historyType) *indexPruner {
	pruner := new(indexPruner)
	pruner.disk = disk
	pruner.typ = typ
	pruner.trigger = make(chan struct{}, 1) // buffered so signaling never blocks
	pruner.closed = make(chan struct{})
	pruner.log = log.New("type", typ.String())
	pruner.pauseReq = make(chan chan struct{})
	pruner.resumeCh = make(chan struct{})

	pruner.wg.Add(1)
	go pruner.run()
	return pruner
}
// prune signals the pruner that the history tail has advanced to the given ID.
// All index entries referencing history IDs below newTail can be removed.
func (p *indexPruner) prune(newTail uint64) {
	// Lift the tail monotonically via CAS; bail out once another caller
	// has already advanced it to (or past) newTail.
	for {
		current := p.tail.Load()
		if current >= newTail {
			return
		}
		if p.tail.CompareAndSwap(current, newTail) {
			break
		}
	}
	// Wake the background loop without blocking; one pending signal is
	// enough since the loop always reads the latest tail value.
	select {
	case p.trigger <- struct{}{}:
	default:
	}
}
// pause requests the pruner to flush all pending writes and pause. It blocks
// until the pruner has acknowledged the pause. This must be paired with a
// subsequent call to resume.
func (p *indexPruner) pause() {
	done := make(chan struct{})
	select {
	case <-p.closed:
		// Pruner already shut down; nothing to pause.
	case p.pauseReq <- done:
		<-done // the pruner closes this channel once it has flushed and parked
	}
}
// resume unblocks a previously paused pruner, allowing it to continue
// processing.
func (p *indexPruner) resume() {
	select {
	case <-p.closed:
		// Already shut down; the paused loop has exited on its own.
	case p.resumeCh <- struct{}{}:
	}
}
// close shuts down the pruner and waits for the background goroutine to
// finish.
//
// NOTE(review): the check-then-close below is not safe if close is invoked
// from multiple goroutines concurrently — two callers could both take the
// default branch and double-close p.closed, which panics. Presumably there
// is a single owning caller; confirm at the call sites.
func (p *indexPruner) close() {
	select {
	case <-p.closed:
		return // already closed
	default:
		close(p.closed)
		p.wg.Wait()
	}
}
// run is the main loop of the pruner. It waits for trigger signals and
// processes a small batch of entries on each trigger, advancing the cursor.
func (p *indexPruner) run() {
	defer p.wg.Done()

	for {
		select {
		case <-p.closed:
			return

		case ack := <-p.pauseReq:
			// No pruning is in flight here, so acknowledge right away
			// and park until the caller resumes (or the pruner closes).
			close(ack)
			select {
			case <-p.resumeCh:
			case <-p.closed:
				return
			}

		case <-p.trigger:
			tail := p.tail.Load()
			// Skip unless the tail advanced by at least the threshold
			// since the last successful run; the first comparison also
			// guards the unsigned subtraction against underflow.
			if tail < p.lastRun || tail-p.lastRun < indexPruningThreshold {
				continue
			}
			if err := p.process(tail); err != nil {
				p.log.Error("Failed to prune index", "tail", tail, "err", err)
				continue
			}
			p.lastRun = tail
		}
	}
}
// process iterates all index metadata entries for the history type and prunes
// leading blocks whose max history ID is below the given tail.
//
// For state histories both the account and the storage namespaces are
// scanned; trienode histories have a single namespace. Returns an error if
// any of the underlying scans fail.
//
// Fix: the original declared a function-wide `var err error` that was then
// accidentally shadowed by `:=` inside the state-history branch; the error
// variables are now scoped per branch so no shadowing can occur.
func (p *indexPruner) process(tail uint64) error {
	var (
		pruned int
		start  = time.Now()
	)
	switch p.typ {
	case typeStateHistory:
		n, err := p.prunePrefix(rawdb.StateHistoryAccountMetadataPrefix, typeAccount, tail)
		if err != nil {
			return err
		}
		pruned += n

		n, err = p.prunePrefix(rawdb.StateHistoryStorageMetadataPrefix, typeStorage, tail)
		if err != nil {
			return err
		}
		pruned += n
		statePruneHistoryIndexTimer.UpdateSince(start)

	case typeTrienodeHistory:
		n, err := p.prunePrefix(rawdb.TrienodeHistoryMetadataPrefix, typeTrienode, tail)
		if err != nil {
			return err
		}
		pruned = n
		trienodePruneHistoryIndexTimer.UpdateSince(start)

	default:
		panic("unknown history type")
	}
	if pruned > 0 {
		p.log.Info("Pruned stale index blocks", "pruned", pruned, "tail", tail, "elapsed", common.PrettyDuration(time.Since(start)))
	}
	return nil
}
// prunePrefix scans all metadata entries under the given prefix and prunes
// leading index blocks below the tail. The iterator is periodically released
// and re-opened to avoid holding a read snapshot that blocks LSM compaction.
//
// It returns the total number of index blocks pruned. Pending deletions are
// accumulated in a write batch and flushed when the batch exceeds
// ethdb.IdealBatchSize, when a pause is acknowledged, or once the scan ends.
func (p *indexPruner) prunePrefix(prefix []byte, elemType elementType, tail uint64) (int, error) {
	var (
		pruned int
		opened = time.Now()
		it     = p.disk.NewIterator(prefix, nil)
		batch  = p.disk.NewBatchWithSize(ethdb.IdealBatchSize)
	)
	for {
		// Terminate if iterator is exhausted
		if !it.Next() {
			it.Release()
			break
		}
		// Check termination or pause request
		select {
		case <-p.closed:
			// Terminate the process if indexer is closed; flush whatever
			// has been accumulated so the work done so far is not lost.
			it.Release()
			if batch.ValueSize() > 0 {
				return pruned, batch.Write()
			}
			return pruned, nil
		case ack := <-p.pauseReq:
			// Save the current position so that after resume the
			// iterator can be re-opened from where it left off.
			start := common.CopyBytes(it.Key()[len(prefix):])
			it.Release()
			// Flush all pending writes before acknowledging the pause.
			var flushErr error
			if batch.ValueSize() > 0 {
				if err := batch.Write(); err != nil {
					flushErr = err
				}
				batch.Reset()
			}
			close(ack)
			// Block until resumed or closed. Always wait here even if
			// the flush failed — returning early would cause resume()
			// to deadlock since nobody would receive on resumeCh.
			select {
			case <-p.resumeCh:
				if flushErr != nil {
					return 0, flushErr
				}
				// Re-open the iterator from the saved position so the
				// pruner sees the current database state (including any
				// writes made by indexer during the pause). The saved key
				// itself has not been processed yet — the pause check runs
				// before the entry is pruned — so re-yielding it first is
				// the desired behavior.
				it = p.disk.NewIterator(prefix, start)
				opened = time.Now()
				continue
			case <-p.closed:
				return pruned, flushErr
			}
		default:
			// Keep processing
		}
		// Prune the index data block
		key, value := it.Key(), it.Value()
		ident, bsize := p.identFromKey(key, prefix, elemType)
		n, err := p.pruneEntry(batch, ident, value, bsize, tail)
		if err != nil {
			// Best-effort: a malformed entry is logged and skipped rather
			// than aborting the entire scan.
			p.log.Warn("Failed to prune index entry", "ident", ident, "err", err)
			continue
		}
		pruned += n
		// Flush the batch if there are too many accumulated
		if batch.ValueSize() >= ethdb.IdealBatchSize {
			if err := batch.Write(); err != nil {
				it.Release()
				return 0, err
			}
			batch.Reset()
		}
		// Periodically release the iterator so the LSM compactor
		// is not blocked by the read snapshot we hold.
		// NOTE(review): unlike the pause path, this reopen seeds the new
		// iterator with the key that was JUST processed, so the next
		// it.Next() presumably re-yields the same entry. pruneEntry looks
		// idempotent (re-deleting blocks and rewriting metadata produces
		// the same batch state), but the pruned counter may double-count
		// that boundary entry — confirm against the iterator semantics.
		if time.Since(opened) >= iteratorReopenInterval {
			opened = time.Now()
			start := common.CopyBytes(it.Key()[len(prefix):])
			it.Release()
			it = p.disk.NewIterator(prefix, start)
		}
	}
	if batch.ValueSize() > 0 {
		if err := batch.Write(); err != nil {
			return 0, err
		}
	}
	return pruned, nil
}
// identFromKey reconstructs the stateIdent and bitmapSize from a metadata key.
//
// The suffix layout after the table prefix depends on the element type:
// accounts carry a 32-byte address hash; storage slots an address hash plus
// a 32-byte slot hash; trienodes an address hash followed by a variable
// length node path.
func (p *indexPruner) identFromKey(key []byte, prefix []byte, elemType elementType) (stateIdent, int) {
	suffix := key[len(prefix):]
	switch elemType {
	case typeAccount:
		// key = prefix + addressHash(32)
		return newAccountIdent(common.BytesToHash(suffix[:32])), 0
	case typeStorage:
		// key = prefix + addressHash(32) + storageHash(32)
		addrHash := common.BytesToHash(suffix[:32])
		storHash := common.BytesToHash(suffix[32:64])
		return newStorageIdent(addrHash, storHash), 0
	case typeTrienode:
		// key = prefix + addressHash(32) + path(variable)
		ident := newTrienodeIdent(common.BytesToHash(suffix[:32]), string(suffix[32:]))
		return ident, ident.bloomSize()
	default:
		panic("unknown element type")
	}
}
// pruneEntry checks a single metadata entry and removes leading index blocks
// whose max < tail. Returns the number of blocks pruned.
func (p *indexPruner) pruneEntry(batch ethdb.Batch, ident stateIdent, blob []byte, bsize int, tail uint64) (int, error) {
	// Fast path: the metadata leads with the big-endian max history ID of
	// the first index block. If that already reaches the tail, nothing can
	// be dropped and the full decode is skipped entirely.
	if len(blob) >= 8 && binary.BigEndian.Uint64(blob[:8]) >= tail {
		return 0, nil
	}
	descList, err := parseIndex(blob, bsize)
	if err != nil {
		return 0, err
	}
	// Blocks are ordered by history ID, so the prunable ones form a prefix:
	// every block whose max history ID is strictly below the tail.
	count := 0
	for count < len(descList) && descList[count].max < tail {
		count++
	}
	if count == 0 {
		return 0, nil
	}
	// Drop the stale index blocks themselves.
	for _, desc := range descList[:count] {
		deleteStateIndexBlock(ident, batch, desc.id)
	}
	// Rewrite (or remove) the metadata to reflect only the survivors.
	retained := descList[count:]
	if len(retained) == 0 {
		// Everything was pruned; the metadata entry itself goes too.
		deleteStateIndex(ident, batch)
		return count, nil
	}
	entrySize := indexBlockDescSize + bsize
	encoded := make([]byte, 0, entrySize*len(retained))
	for _, desc := range retained {
		encoded = append(encoded, desc.encode()...)
	}
	writeStateIndex(ident, batch, encoded)
	return count, nil
}