eth/protocols/snap: add healing and syncing metrics (#32258)
Some checks failed
/ Linux Build (push) Has been cancelled
/ Linux Build (arm) (push) Has been cancelled
/ Windows Build (push) Has been cancelled
/ Docker Image (push) Has been cancelled

Adds the state heal time and the snap sync time as metrics gauges, so that both durations can be displayed in Grafana.

---------

Co-authored-by: Gary Rong <garyrong0905@gmail.com>
This commit is contained in:
Marius van der Wijden 2025-07-24 10:43:04 +02:00 committed by GitHub
parent 16117eb7cd
commit b369a855fb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 26 additions and 3 deletions

View file

@ -66,4 +66,7 @@ var (
// discarded during the snap sync.
largeStorageDiscardGauge = metrics.NewRegisteredGauge("eth/protocols/snap/sync/storage/chunk/discard", nil)
largeStorageResumedGauge = metrics.NewRegisteredGauge("eth/protocols/snap/sync/storage/chunk/resume", nil)
stateSyncTimeGauge = metrics.NewRegisteredGauge("eth/protocols/snap/sync/time/statesync", nil)
stateHealTimeGauge = metrics.NewRegisteredGauge("eth/protocols/snap/sync/time/stateheal", nil)
)

View file

@ -502,8 +502,10 @@ type Syncer struct {
storageHealed uint64 // Number of storage slots downloaded during the healing stage
storageHealedBytes common.StorageSize // Number of raw storage bytes persisted to disk during the healing stage
startTime time.Time // Time instance when snapshot sync started
logTime time.Time // Time instance when status was last reported
startTime time.Time // Time instance when snapshot sync started
healStartTime time.Time // Time instance when the state healing started
syncTimeOnce sync.Once // Ensure that the state sync time is uploaded only once
logTime time.Time // Time instance when status was last reported
pend sync.WaitGroup // Tracks network request goroutines for graceful shutdown
lock sync.RWMutex // Protects fields that can change outside of sync (peers, reqs, root)
@ -685,6 +687,14 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
s.cleanStorageTasks()
s.cleanAccountTasks()
if len(s.tasks) == 0 && s.healer.scheduler.Pending() == 0 {
// State healing phase completed, record the elapsed time in metrics.
// Note: healing may be rerun in subsequent cycles to fill gaps between
// pivot states (e.g., if chain sync takes longer).
if !s.healStartTime.IsZero() {
stateHealTimeGauge.Inc(int64(time.Since(s.healStartTime)))
log.Info("State healing phase is completed", "elapsed", common.PrettyDuration(time.Since(s.healStartTime)))
s.healStartTime = time.Time{}
}
return nil
}
// Assign all the data retrieval tasks to any free peers
@ -693,7 +703,17 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
s.assignStorageTasks(storageResps, storageReqFails, cancel)
if len(s.tasks) == 0 {
// Sync phase done, run heal phase
// State sync phase completed, record the elapsed time in metrics.
// Note: the initial state sync runs only once, regardless of whether
// a new cycle is started later. Any state differences in subsequent
// cycles will be handled by the state healer.
s.syncTimeOnce.Do(func() {
stateSyncTimeGauge.Update(int64(time.Since(s.startTime)))
log.Info("State sync phase is completed", "elapsed", common.PrettyDuration(time.Since(s.startTime)))
})
if s.healStartTime.IsZero() {
s.healStartTime = time.Now()
}
s.assignTrienodeHealTasks(trienodeHealResps, trienodeHealReqFails, cancel)
s.assignBytecodeHealTasks(bytecodeHealResps, bytecodeHealReqFails, cancel)
}