From 7118cc2fc9c91b8cae1952146a332c852633087a Mon Sep 17 00:00:00 2001 From: knQzx <75641500+knQzx@users.noreply.github.com> Date: Sat, 4 Apr 2026 17:31:49 +0200 Subject: [PATCH] fix prometheus histogram count and sum to be cumulative resetting timers and resetting samples now track cumulative count and sum across snapshots so that prometheus _count and _sum metrics monotonically increase as required by the prometheus spec. the per-window values and percentiles still reset normally. --- metrics/prometheus/collector.go | 4 +- metrics/prometheus/collector_test.go | 44 +++++++++++++ metrics/resetting_sample.go | 20 ++++++ metrics/resetting_timer.go | 25 +++++++- metrics/resetting_timer_test.go | 94 ++++++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 3 deletions(-) diff --git a/metrics/prometheus/collector.go b/metrics/prometheus/collector.go index 31b8c51b65..335991ccc3 100644 --- a/metrics/prometheus/collector.go +++ b/metrics/prometheus/collector.go @@ -122,12 +122,12 @@ func (c *collector) addTimer(name string, m *metrics.TimerSnapshot) { } func (c *collector) addResettingTimer(name string, m *metrics.ResettingTimerSnapshot) { - if m.Count() <= 0 { + if m.TotalCount() <= 0 { return } pv := []float64{0.5, 0.75, 0.95, 0.99, 0.999, 0.9999} ps := m.Percentiles(pv) - c.writeSummaryCounter(name, m.Count()) + c.writeSummaryCounter(name, m.TotalCount()) c.buff.WriteString(fmt.Sprintf(typeSummaryTpl, mutateKey(name))) for i := range pv { c.writeSummaryPercentile(name, strconv.FormatFloat(pv[i], 'f', -1, 64), ps[i]) diff --git a/metrics/prometheus/collector_test.go b/metrics/prometheus/collector_test.go index a8585d1226..b4fe2deeb8 100644 --- a/metrics/prometheus/collector_test.go +++ b/metrics/prometheus/collector_test.go @@ -21,6 +21,7 @@ import ( "os" "strings" "testing" + "time" "github.com/ethereum/go-ethereum/metrics" "github.com/ethereum/go-ethereum/metrics/internal" @@ -51,6 +52,49 @@ func TestCollector(t *testing.T) { } } +func TestResettingTimerCumulativePrometheus(t *testing.T) { + registry := metrics.NewRegistry() + timer := metrics.NewRegisteredResettingTimer("test/resetting", registry) + + // First batch of updates. + timer.Update(10 * time.Millisecond) + timer.Update(20 * time.Millisecond) + + // First scrape. + c1 := newCollector() + registry.Each(func(name string, i interface{}) { + c1.Add(name, i) + }) + out1 := c1.buff.String() + if !strings.Contains(out1, "test_resetting_count 2") { + t.Fatalf("first scrape should have count 2, got:\n%s", out1) + } + + // Second batch. + timer.Update(30 * time.Millisecond) + + // Second scrape - count should be cumulative (3, not 1). + c2 := newCollector() + registry.Each(func(name string, i interface{}) { + c2.Add(name, i) + }) + out2 := c2.buff.String() + if !strings.Contains(out2, "test_resetting_count 3") { + t.Fatalf("second scrape should have cumulative count 3, got:\n%s", out2) + } + + // Third scrape with no new updates - count should stay at 3. + c3 := newCollector() + registry.Each(func(name string, i interface{}) { + c3.Add(name, i) + }) + out3 := c3.buff.String() + // With no new events and totalCount > 0, we still need to report. + if !strings.Contains(out3, "test_resetting_count 3") { + t.Fatalf("third scrape should still report cumulative count 3, got:\n%s", out3) + } +} + func findFirstDiffPos(a, b string) string { yy := strings.Split(b, "\n") for i, x := range strings.Split(a, "\n") { diff --git a/metrics/resetting_sample.go b/metrics/resetting_sample.go index 730ef93416..cad93e36fe 100644 --- a/metrics/resetting_sample.go +++ b/metrics/resetting_sample.go @@ -1,5 +1,7 @@ package metrics +import "sync" + // ResettingSample converts an ordinary sample into one that resets whenever its // snapshot is retrieved. This will break for multi-monitor systems, but when only // a single metric is being pushed out, this ensure that low-frequency events don't @@ -14,11 +16,29 @@ func ResettingSample(sample Sample) Sample { // snapshot retrieval. type resettingSample struct { Sample + mutex sync.Mutex + totalCount int64 // cumulative count across all snapshots + totalSum int64 // cumulative sum across all snapshots } // Snapshot returns a read-only copy of the sample with the original reset. +// The returned snapshot has cumulative count and sum values that monotonically +// increase across resets, as required by the Prometheus counter convention. func (rs *resettingSample) Snapshot() *sampleSnapshot { + rs.mutex.Lock() + defer rs.mutex.Unlock() + s := rs.Sample.Snapshot() rs.Sample.Clear() + + // Accumulate cumulative totals from this snapshot's window. + rs.totalCount += s.count + rs.totalSum += s.sum + + // Override count and sum with cumulative values so that Prometheus + // _count and _sum are monotonically increasing counters. + s.count = rs.totalCount + s.sum = rs.totalSum + return s } diff --git a/metrics/resetting_timer.go b/metrics/resetting_timer.go index a3f46e52e0..b5ded7577e 100644 --- a/metrics/resetting_timer.go +++ b/metrics/resetting_timer.go @@ -36,6 +36,9 @@ type ResettingTimer struct { values []int64 sum int64 // sum is a running count of the total sum, used later to calculate mean + totalCount int64 // cumulative count across all snapshots + totalSum int64 // cumulative sum across all snapshots + mutex sync.Mutex } @@ -43,7 +46,15 @@ type ResettingTimer struct { func (t *ResettingTimer) Snapshot() *ResettingTimerSnapshot { t.mutex.Lock() defer t.mutex.Unlock() - snapshot := &ResettingTimerSnapshot{} + + // Accumulate cumulative totals before resetting. + t.totalCount += int64(len(t.values)) + t.totalSum += t.sum + + snapshot := &ResettingTimerSnapshot{ + totalCount: t.totalCount, + totalSum: t.totalSum, + } if len(t.values) > 0 { snapshot.mean = float64(t.sum) / float64(len(t.values)) snapshot.values = t.values @@ -84,6 +95,8 @@ type ResettingTimerSnapshot struct { min int64 thresholdBoundaries []float64 calculated bool + totalCount int64 // cumulative count across all snapshots + totalSum int64 // cumulative sum across all snapshots } // Count return the length of the values from snapshot. @@ -91,6 +104,16 @@ func (t *ResettingTimerSnapshot) Count() int { return len(t.values) } +// TotalCount returns the cumulative count of events across all snapshots. +func (t *ResettingTimerSnapshot) TotalCount() int64 { + return t.totalCount +} + +// TotalSum returns the cumulative sum of event durations across all snapshots. +func (t *ResettingTimerSnapshot) TotalSum() int64 { + return t.totalSum +} + // Percentiles returns the boundaries for the input percentiles. // note: this method is not thread safe func (t *ResettingTimerSnapshot) Percentiles(percentiles []float64) []float64 { diff --git a/metrics/resetting_timer_test.go b/metrics/resetting_timer_test.go index 4571fc8eb0..624a348366 100644 --- a/metrics/resetting_timer_test.go +++ b/metrics/resetting_timer_test.go @@ -5,6 +5,100 @@ import ( "time" ) +func TestResettingTimerCumulativeCountAndSum(t *testing.T) { + timer := NewResettingTimer() + + // First batch of updates. + timer.Update(10 * time.Millisecond) + timer.Update(20 * time.Millisecond) + timer.Update(30 * time.Millisecond) + + snap1 := timer.Snapshot() + if have, want := snap1.Count(), 3; have != want { + t.Fatalf("snap1 count: have %d, want %d", have, want) + } + if have, want := snap1.TotalCount(), int64(3); have != want { + t.Fatalf("snap1 total count: have %d, want %d", have, want) + } + wantSum1 := int64(10*time.Millisecond + 20*time.Millisecond + 30*time.Millisecond) + if have := snap1.TotalSum(); have != wantSum1 { + t.Fatalf("snap1 total sum: have %d, want %d", have, wantSum1) + } + + // Second batch of updates (after snapshot reset the values). + timer.Update(40 * time.Millisecond) + timer.Update(50 * time.Millisecond) + + snap2 := timer.Snapshot() + // Per-window count should be 2. + if have, want := snap2.Count(), 2; have != want { + t.Fatalf("snap2 count: have %d, want %d", have, want) + } + // Cumulative count should be 5 (3 + 2). + if have, want := snap2.TotalCount(), int64(5); have != want { + t.Fatalf("snap2 total count: have %d, want %d", have, want) + } + // Cumulative sum should include both batches. + wantSum2 := wantSum1 + int64(40*time.Millisecond+50*time.Millisecond) + if have := snap2.TotalSum(); have != wantSum2 { + t.Fatalf("snap2 total sum: have %d, want %d", have, wantSum2) + } + + // Empty snapshot should still report the same cumulative totals. + snap3 := timer.Snapshot() + if have, want := snap3.Count(), 0; have != want { + t.Fatalf("snap3 count: have %d, want %d", have, want) + } + if have, want := snap3.TotalCount(), int64(5); have != want { + t.Fatalf("snap3 total count: have %d, want %d", have, want) + } + if have := snap3.TotalSum(); have != wantSum2 { + t.Fatalf("snap3 total sum: have %d, want %d", have, wantSum2) + } +} + +func TestResettingSampleCumulativeCountAndSum(t *testing.T) { + Enable() + + s := ResettingSample(NewUniformSample(100)) + + // First batch. + s.Update(10) + s.Update(20) + s.Update(30) + + snap1 := s.Snapshot() + if have, want := snap1.Count(), int64(3); have != want { + t.Fatalf("snap1 count: have %d, want %d", have, want) + } + if have, want := snap1.Sum(), int64(60); have != want { + t.Fatalf("snap1 sum: have %d, want %d", have, want) + } + + // Second batch. + s.Update(40) + s.Update(50) + + snap2 := s.Snapshot() + // Count should be cumulative: 3 + 2 = 5. + if have, want := snap2.Count(), int64(5); have != want { + t.Fatalf("snap2 count: have %d, want %d", have, want) + } + // Sum should be cumulative: 60 + 90 = 150. + if have, want := snap2.Sum(), int64(150); have != want { + t.Fatalf("snap2 sum: have %d, want %d", have, want) + } + + // Empty snapshot should still report cumulative totals. + snap3 := s.Snapshot() + if have, want := snap3.Count(), int64(5); have != want { + t.Fatalf("snap3 count: have %d, want %d", have, want) + } + if have, want := snap3.Sum(), int64(150); have != want { + t.Fatalf("snap3 sum: have %d, want %d", have, want) + } +} + func TestResettingTimer(t *testing.T) { tests := []struct { values []int64