mirror of
https://github.com/ethereum/go-ethereum.git
synced 2026-04-06 09:57:29 +00:00
This PR implements the missing functionality for archive nodes by pruning stale index data. The current mechanism is relatively simple but sufficient for now: it periodically iterates over index entries and deletes outdated data on a per-block basis. The pruning process is triggered every 90,000 new blocks (approximately every 12 days), and the iteration typically takes ~30 minutes on a mainnet node. This mechanism is only applied with `gcmode=archive` enabled and has no impact on normal full nodes.
385 lines
11 KiB
Go
385 lines
11 KiB
Go
// Copyright 2025 The go-ethereum Authors
|
|
// This file is part of the go-ethereum library.
|
|
//
|
|
// The go-ethereum library is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// The go-ethereum library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package pathdb
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/ethereum/go-ethereum/common"
|
|
"github.com/ethereum/go-ethereum/core/rawdb"
|
|
"github.com/ethereum/go-ethereum/ethdb"
|
|
"github.com/ethereum/go-ethereum/log"
|
|
)
|
|
|
|
const (
	// indexPruningThreshold defines the number of pruned histories that must
	// accumulate before triggering index pruning. This helps avoid scheduling
	// index pruning too frequently.
	indexPruningThreshold = 90000

	// iteratorReopenInterval is how long the iterator is kept open before
	// being released and re-opened. Long-lived iterators hold a read snapshot
	// that blocks LSM compaction; periodically re-opening avoids stalling the
	// compactor during a large scan.
	iteratorReopenInterval = 30 * time.Second
)
|
|
|
|
// indexPruner is responsible for pruning stale index data from the tail side
// when old history objects are removed. It runs as a background goroutine and
// processes pruning signals whenever the history tail advances.
//
// The pruning operates at the block level: for each state element's index
// metadata, leading index blocks whose maximum history ID falls below the
// new tail are removed entirely. This avoids the need to decode individual
// block contents and is efficient because index blocks store monotonically
// increasing history IDs.
type indexPruner struct {
	disk    ethdb.KeyValueStore // Backing key-value store holding the index data
	typ     historyType         // History flavor this pruner is responsible for
	tail    atomic.Uint64       // Tail below which index entries can be pruned
	lastRun uint64              // The tail in the last pruning run; accessed only by the run loop
	trigger chan struct{}       // Non-blocking signal that tail has advanced
	closed  chan struct{}       // Closed on shutdown to terminate the background loop
	wg      sync.WaitGroup      // Tracks the background goroutine for clean shutdown
	log     log.Logger          // Contextual logger tagged with the history type

	pauseReq chan chan struct{} // Pause request; caller sends ack channel, pruner closes it when paused
	resumeCh chan struct{}      // Resume signal sent by caller after indexSingle/unindexSingle completes
}
|
|
|
|
// newIndexPruner creates and starts a new index pruner for the given history type.
|
|
func newIndexPruner(disk ethdb.KeyValueStore, typ historyType) *indexPruner {
|
|
p := &indexPruner{
|
|
disk: disk,
|
|
typ: typ,
|
|
trigger: make(chan struct{}, 1),
|
|
closed: make(chan struct{}),
|
|
log: log.New("type", typ.String()),
|
|
pauseReq: make(chan chan struct{}),
|
|
resumeCh: make(chan struct{}),
|
|
}
|
|
p.wg.Add(1)
|
|
go p.run()
|
|
return p
|
|
}
|
|
|
|
// prune signals the pruner that the history tail has advanced to the given ID.
|
|
// All index entries referencing history IDs below newTail can be removed.
|
|
func (p *indexPruner) prune(newTail uint64) {
|
|
// Only update if the tail is actually advancing
|
|
for {
|
|
old := p.tail.Load()
|
|
if newTail <= old {
|
|
return
|
|
}
|
|
if p.tail.CompareAndSwap(old, newTail) {
|
|
break
|
|
}
|
|
}
|
|
// Non-blocking signal
|
|
select {
|
|
case p.trigger <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
|
|
// pause requests the pruner to flush all pending writes and pause. It blocks
|
|
// until the pruner has acknowledged the pause. This must be paired with a
|
|
// subsequent call to resume.
|
|
func (p *indexPruner) pause() {
|
|
ack := make(chan struct{})
|
|
select {
|
|
case p.pauseReq <- ack:
|
|
<-ack // wait for the pruner to flush and acknowledge
|
|
case <-p.closed:
|
|
}
|
|
}
|
|
|
|
// resume unblocks a previously paused pruner, allowing it to continue
// processing. It is a no-op if the pruner has already been shut down.
// Must only be called after a successful pause.
func (p *indexPruner) resume() {
	select {
	case p.resumeCh <- struct{}{}:
	case <-p.closed:
	}
}
|
|
|
|
// close shuts down the pruner and waits for it to finish.
//
// NOTE(review): the check-then-close pattern is not safe if two goroutines
// invoke close concurrently — both may take the default branch and the second
// close(p.closed) would panic. Presumably the caller serializes shutdown;
// confirm at the call sites.
func (p *indexPruner) close() {
	select {
	case <-p.closed:
		return
	default:
		close(p.closed)
		p.wg.Wait()
	}
}
|
|
|
|
// run is the main loop of the pruner. It waits for trigger signals and
// processes a small batch of entries on each trigger, advancing the cursor.
// The loop terminates when the closed channel is shut.
func (p *indexPruner) run() {
	defer p.wg.Done()

	for {
		select {
		case <-p.trigger:
			// Skip the pass unless the tail has advanced by at least the
			// threshold since the last successful run; this throttles the
			// expensive full index scan.
			tail := p.tail.Load()
			if tail < p.lastRun || tail-p.lastRun < indexPruningThreshold {
				continue
			}
			if err := p.process(tail); err != nil {
				p.log.Error("Failed to prune index", "tail", tail, "err", err)
			} else {
				// Only record the tail on success so a failed pass is
				// retried on the next trigger.
				p.lastRun = tail
			}

		case ack := <-p.pauseReq:
			// Pruner is idle, acknowledge immediately and wait for resume.
			close(ack)
			select {
			case <-p.resumeCh:
			case <-p.closed:
				return
			}

		case <-p.closed:
			return
		}
	}
}
|
|
|
|
// process iterates all index metadata entries for the history type and prunes
|
|
// leading blocks whose max history ID is below the given tail.
|
|
func (p *indexPruner) process(tail uint64) error {
|
|
var (
|
|
err error
|
|
pruned int
|
|
start = time.Now()
|
|
)
|
|
switch p.typ {
|
|
case typeStateHistory:
|
|
n, err := p.prunePrefix(rawdb.StateHistoryAccountMetadataPrefix, typeAccount, tail)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
pruned += n
|
|
|
|
n, err = p.prunePrefix(rawdb.StateHistoryStorageMetadataPrefix, typeStorage, tail)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
pruned += n
|
|
statePruneHistoryIndexTimer.UpdateSince(start)
|
|
|
|
case typeTrienodeHistory:
|
|
pruned, err = p.prunePrefix(rawdb.TrienodeHistoryMetadataPrefix, typeTrienode, tail)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
trienodePruneHistoryIndexTimer.UpdateSince(start)
|
|
|
|
default:
|
|
panic("unknown history type")
|
|
}
|
|
if pruned > 0 {
|
|
p.log.Info("Pruned stale index blocks", "pruned", pruned, "tail", tail, "elapsed", common.PrettyDuration(time.Since(start)))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// prunePrefix scans all metadata entries under the given prefix and prunes
// leading index blocks below the tail. The iterator is periodically released
// and re-opened to avoid holding a read snapshot that blocks LSM compaction.
// It returns the total number of index blocks removed.
func (p *indexPruner) prunePrefix(prefix []byte, elemType elementType, tail uint64) (int, error) {
	var (
		pruned int
		opened = time.Now()
		it     = p.disk.NewIterator(prefix, nil)
		batch  = p.disk.NewBatchWithSize(ethdb.IdealBatchSize)
	)
	for {
		// Terminate if iterator is exhausted
		if !it.Next() {
			it.Release()
			break
		}
		// Check termination or pause request before processing the entry,
		// so a paused scan can safely resume at the current key.
		select {
		case <-p.closed:
			// Terminate the process if indexer is closed; flush whatever
			// has been accumulated so far.
			it.Release()
			if batch.ValueSize() > 0 {
				return pruned, batch.Write()
			}
			return pruned, nil

		case ack := <-p.pauseReq:
			// Save the current position so that after resume the
			// iterator can be re-opened from where it left off.
			start := common.CopyBytes(it.Key()[len(prefix):])
			it.Release()

			// Flush all pending writes before acknowledging the pause.
			var flushErr error
			if batch.ValueSize() > 0 {
				if err := batch.Write(); err != nil {
					flushErr = err
				}
				batch.Reset()
			}
			close(ack)

			// Block until resumed or closed. Always wait here even if
			// the flush failed — returning early would cause resume()
			// to deadlock since nobody would receive on resumeCh.
			select {
			case <-p.resumeCh:
				if flushErr != nil {
					return 0, flushErr
				}
				// Re-open the iterator from the saved position so the
				// pruner sees the current database state (including any
				// writes made by indexer during the pause).
				it = p.disk.NewIterator(prefix, start)
				opened = time.Now()
				continue
			case <-p.closed:
				return pruned, flushErr
			}

		default:
			// Keep processing
		}

		// Prune the index data block
		key, value := it.Key(), it.Value()
		ident, bsize := p.identFromKey(key, prefix, elemType)
		n, err := p.pruneEntry(batch, ident, value, bsize, tail)
		if err != nil {
			// Best-effort: a single corrupt entry is logged and skipped
			// rather than aborting the whole scan.
			p.log.Warn("Failed to prune index entry", "ident", ident, "err", err)
			continue
		}
		pruned += n

		// Flush the batch if there are too many accumulated
		if batch.ValueSize() >= ethdb.IdealBatchSize {
			if err := batch.Write(); err != nil {
				it.Release()
				return 0, err
			}
			batch.Reset()
		}

		// Periodically release the iterator so the LSM compactor
		// is not blocked by the read snapshot we hold.
		//
		// NOTE(review): the re-opened iterator seeks back to the key that
		// was just processed, so the next it.Next() yields the same entry
		// again. Re-pruning it appears idempotent, but the returned count
		// may be slightly inflated — confirm this is intended.
		if time.Since(opened) >= iteratorReopenInterval {
			opened = time.Now()

			start := common.CopyBytes(it.Key()[len(prefix):])
			it.Release()
			it = p.disk.NewIterator(prefix, start)
		}
	}
	// Flush the final partial batch.
	if batch.ValueSize() > 0 {
		if err := batch.Write(); err != nil {
			return 0, err
		}
	}
	return pruned, nil
}
|
|
|
|
// identFromKey reconstructs the stateIdent and bitmapSize from a metadata key.
|
|
func (p *indexPruner) identFromKey(key []byte, prefix []byte, elemType elementType) (stateIdent, int) {
|
|
rest := key[len(prefix):]
|
|
|
|
switch elemType {
|
|
case typeAccount:
|
|
// key = prefix + addressHash(32)
|
|
var addrHash common.Hash
|
|
copy(addrHash[:], rest[:32])
|
|
return newAccountIdent(addrHash), 0
|
|
|
|
case typeStorage:
|
|
// key = prefix + addressHash(32) + storageHash(32)
|
|
var addrHash, storHash common.Hash
|
|
copy(addrHash[:], rest[:32])
|
|
copy(storHash[:], rest[32:64])
|
|
return newStorageIdent(addrHash, storHash), 0
|
|
|
|
case typeTrienode:
|
|
// key = prefix + addressHash(32) + path(variable)
|
|
var addrHash common.Hash
|
|
copy(addrHash[:], rest[:32])
|
|
path := string(rest[32:])
|
|
ident := newTrienodeIdent(addrHash, path)
|
|
return ident, ident.bloomSize()
|
|
|
|
default:
|
|
panic("unknown element type")
|
|
}
|
|
}
|
|
|
|
// pruneEntry checks a single metadata entry and removes leading index blocks
|
|
// whose max < tail. Returns the number of blocks pruned.
|
|
func (p *indexPruner) pruneEntry(batch ethdb.Batch, ident stateIdent, blob []byte, bsize int, tail uint64) (int, error) {
|
|
// Fast path: the first 8 bytes of the metadata encode the max history ID
|
|
// of the first index block (big-endian uint64). If it is >= tail, no
|
|
// blocks can be pruned and we skip the full parse entirely.
|
|
if len(blob) >= 8 && binary.BigEndian.Uint64(blob[:8]) >= tail {
|
|
return 0, nil
|
|
}
|
|
descList, err := parseIndex(blob, bsize)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
// Find the number of leading blocks that can be entirely pruned.
|
|
// A block can be pruned if its max history ID is strictly below
|
|
// the tail.
|
|
var count int
|
|
for _, desc := range descList {
|
|
if desc.max < tail {
|
|
count++
|
|
} else {
|
|
break // blocks are ordered, no more to prune
|
|
}
|
|
}
|
|
if count == 0 {
|
|
return 0, nil
|
|
}
|
|
// Delete the pruned index blocks
|
|
for i := 0; i < count; i++ {
|
|
deleteStateIndexBlock(ident, batch, descList[i].id)
|
|
}
|
|
// Update or delete the metadata
|
|
remaining := descList[count:]
|
|
if len(remaining) == 0 {
|
|
// All blocks pruned, remove the metadata entry entirely
|
|
deleteStateIndex(ident, batch)
|
|
} else {
|
|
// Rewrite the metadata with the remaining blocks
|
|
size := indexBlockDescSize + bsize
|
|
buf := make([]byte, 0, size*len(remaining))
|
|
for _, desc := range remaining {
|
|
buf = append(buf, desc.encode()...)
|
|
}
|
|
writeStateIndex(ident, batch, buf)
|
|
}
|
|
return count, nil
|
|
}
|