From 7118cc2fc9c91b8cae1952146a332c852633087a Mon Sep 17 00:00:00 2001
From: knQzx <75641500+knQzx@users.noreply.github.com>
Date: Sat, 4 Apr 2026 17:31:49 +0200
Subject: [PATCH] fix prometheus histogram count and sum to be cumulative

resetting timers and resetting samples now track cumulative count and
sum across snapshots so that prometheus _count and _sum metrics
monotonically increase as required by the prometheus spec. the per-window
values and percentiles still reset normally.
---
 metrics/prometheus/collector.go      |  4 +-
 metrics/prometheus/collector_test.go | 44 +++++++++++++
 metrics/resetting_sample.go          | 20 ++++++
 metrics/resetting_timer.go           | 25 +++++++-
 metrics/resetting_timer_test.go      | 94 ++++++++++++++++++++++++++++
 5 files changed, 184 insertions(+), 3 deletions(-)

diff --git a/metrics/prometheus/collector.go b/metrics/prometheus/collector.go
index 31b8c51b65..335991ccc3 100644
--- a/metrics/prometheus/collector.go
+++ b/metrics/prometheus/collector.go
@@ -122,12 +122,12 @@ func (c *collector) addTimer(name string, m *metrics.TimerSnapshot) {
 }
 
 func (c *collector) addResettingTimer(name string, m *metrics.ResettingTimerSnapshot) {
-	if m.Count() <= 0 {
+	if m.TotalCount() <= 0 {
 		return
 	}
 	pv := []float64{0.5, 0.75, 0.95, 0.99, 0.999, 0.9999}
 	ps := m.Percentiles(pv)
-	c.writeSummaryCounter(name, m.Count())
+	c.writeSummaryCounter(name, m.TotalCount())
 	c.buff.WriteString(fmt.Sprintf(typeSummaryTpl, mutateKey(name)))
 	for i := range pv {
 		c.writeSummaryPercentile(name, strconv.FormatFloat(pv[i], 'f', -1, 64), ps[i])
diff --git a/metrics/prometheus/collector_test.go b/metrics/prometheus/collector_test.go
index a8585d1226..b4fe2deeb8 100644
--- a/metrics/prometheus/collector_test.go
+++ b/metrics/prometheus/collector_test.go
@@ -21,6 +21,7 @@ import (
 	"os"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/ethereum/go-ethereum/metrics"
 	"github.com/ethereum/go-ethereum/metrics/internal"
@@ -51,6 +52,49 @@ func TestCollector(t *testing.T) {
 	}
 }
 
+func TestResettingTimerCumulativePrometheus(t *testing.T) {
+	registry := metrics.NewRegistry()
+	timer := metrics.NewRegisteredResettingTimer("test/resetting", registry)
+
+	// First batch of updates.
+	timer.Update(10 * time.Millisecond)
+	timer.Update(20 * time.Millisecond)
+
+	// First scrape.
+	c1 := newCollector()
+	registry.Each(func(name string, i interface{}) {
+		c1.Add(name, i)
+	})
+	out1 := c1.buff.String()
+	if !strings.Contains(out1, "test_resetting_count 2") {
+		t.Fatalf("first scrape should have count 2, got:\n%s", out1)
+	}
+
+	// Second batch.
+	timer.Update(30 * time.Millisecond)
+
+	// Second scrape - count should be cumulative (3, not 1).
+	c2 := newCollector()
+	registry.Each(func(name string, i interface{}) {
+		c2.Add(name, i)
+	})
+	out2 := c2.buff.String()
+	if !strings.Contains(out2, "test_resetting_count 3") {
+		t.Fatalf("second scrape should have cumulative count 3, got:\n%s", out2)
+	}
+
+	// Third scrape with no new updates - count should stay at 3.
+	c3 := newCollector()
+	registry.Each(func(name string, i interface{}) {
+		c3.Add(name, i)
+	})
+	out3 := c3.buff.String()
+	// With no new events and totalCount > 0, we still need to report.
+	if !strings.Contains(out3, "test_resetting_count 3") {
+		t.Fatalf("third scrape should still report cumulative count 3, got:\n%s", out3)
+	}
+}
+
 func findFirstDiffPos(a, b string) string {
 	yy := strings.Split(b, "\n")
 	for i, x := range strings.Split(a, "\n") {
diff --git a/metrics/resetting_sample.go b/metrics/resetting_sample.go
index 730ef93416..cad93e36fe 100644
--- a/metrics/resetting_sample.go
+++ b/metrics/resetting_sample.go
@@ -1,5 +1,7 @@
 package metrics
 
+import "sync"
+
 // ResettingSample converts an ordinary sample into one that resets whenever its
 // snapshot is retrieved. This will break for multi-monitor systems, but when only
 // a single metric is being pushed out, this ensure that low-frequency events don't
@@ -14,11 +16,29 @@ func ResettingSample(sample Sample) Sample {
 // snapshot retrieval.
 type resettingSample struct {
 	Sample
+	mutex      sync.Mutex
+	totalCount int64 // cumulative count across all snapshots
+	totalSum   int64 // cumulative sum across all snapshots
 }
 
 // Snapshot returns a read-only copy of the sample with the original reset.
+// The returned snapshot has cumulative count and sum values that monotonically
+// increase across resets, as required by the Prometheus counter convention.
 func (rs *resettingSample) Snapshot() *sampleSnapshot {
+	rs.mutex.Lock()
+	defer rs.mutex.Unlock()
+
 	s := rs.Sample.Snapshot()
 	rs.Sample.Clear()
+
+	// Accumulate cumulative totals from this snapshot's window.
+	rs.totalCount += s.count
+	rs.totalSum += s.sum
+
+	// Override count and sum with cumulative values so that Prometheus
+	// _count and _sum are monotonically increasing counters.
+	s.count = rs.totalCount
+	s.sum = rs.totalSum
+
 	return s
 }
diff --git a/metrics/resetting_timer.go b/metrics/resetting_timer.go
index a3f46e52e0..b5ded7577e 100644
--- a/metrics/resetting_timer.go
+++ b/metrics/resetting_timer.go
@@ -36,6 +36,9 @@ type ResettingTimer struct {
 	values []int64
 	sum    int64 // sum is a running count of the total sum, used later to calculate mean
 
+	totalCount int64 // cumulative count across all snapshots
+	totalSum   int64 // cumulative sum across all snapshots
+
 	mutex sync.Mutex
 }
 
@@ -43,7 +46,15 @@ type ResettingTimer struct {
 func (t *ResettingTimer) Snapshot() *ResettingTimerSnapshot {
 	t.mutex.Lock()
 	defer t.mutex.Unlock()
-	snapshot := &ResettingTimerSnapshot{}
+
+	// Accumulate cumulative totals before resetting.
+	t.totalCount += int64(len(t.values))
+	t.totalSum += t.sum
+
+	snapshot := &ResettingTimerSnapshot{
+		totalCount: t.totalCount,
+		totalSum:   t.totalSum,
+	}
 	if len(t.values) > 0 {
 		snapshot.mean = float64(t.sum) / float64(len(t.values))
 		snapshot.values = t.values
@@ -84,6 +95,8 @@ type ResettingTimerSnapshot struct {
 	min                 int64
 	thresholdBoundaries []float64
 	calculated          bool
+	totalCount          int64 // cumulative count across all snapshots
+	totalSum            int64 // cumulative sum across all snapshots
 }
 
 // Count return the length of the values from snapshot.
@@ -91,6 +104,16 @@ func (t *ResettingTimerSnapshot) Count() int {
 	return len(t.values)
 }
 
+// TotalCount returns the cumulative count of events across all snapshots.
+func (t *ResettingTimerSnapshot) TotalCount() int64 {
+	return t.totalCount
+}
+
+// TotalSum returns the cumulative sum of event durations across all snapshots.
+func (t *ResettingTimerSnapshot) TotalSum() int64 {
+	return t.totalSum
+}
+
 // Percentiles returns the boundaries for the input percentiles.
 // note: this method is not thread safe
 func (t *ResettingTimerSnapshot) Percentiles(percentiles []float64) []float64 {
diff --git a/metrics/resetting_timer_test.go b/metrics/resetting_timer_test.go
index 4571fc8eb0..624a348366 100644
--- a/metrics/resetting_timer_test.go
+++ b/metrics/resetting_timer_test.go
@@ -5,6 +5,100 @@ import (
 	"time"
 )
 
+func TestResettingTimerCumulativeCountAndSum(t *testing.T) {
+	timer := NewResettingTimer()
+
+	// First batch of updates.
+	timer.Update(10 * time.Millisecond)
+	timer.Update(20 * time.Millisecond)
+	timer.Update(30 * time.Millisecond)
+
+	snap1 := timer.Snapshot()
+	if have, want := snap1.Count(), 3; have != want {
+		t.Fatalf("snap1 count: have %d, want %d", have, want)
+	}
+	if have, want := snap1.TotalCount(), int64(3); have != want {
+		t.Fatalf("snap1 total count: have %d, want %d", have, want)
+	}
+	wantSum1 := int64(10*time.Millisecond + 20*time.Millisecond + 30*time.Millisecond)
+	if have := snap1.TotalSum(); have != wantSum1 {
+		t.Fatalf("snap1 total sum: have %d, want %d", have, wantSum1)
+	}
+
+	// Second batch of updates (after snapshot reset the values).
+	timer.Update(40 * time.Millisecond)
+	timer.Update(50 * time.Millisecond)
+
+	snap2 := timer.Snapshot()
+	// Per-window count should be 2.
+	if have, want := snap2.Count(), 2; have != want {
+		t.Fatalf("snap2 count: have %d, want %d", have, want)
+	}
+	// Cumulative count should be 5 (3 + 2).
+	if have, want := snap2.TotalCount(), int64(5); have != want {
+		t.Fatalf("snap2 total count: have %d, want %d", have, want)
+	}
+	// Cumulative sum should include both batches.
+	wantSum2 := wantSum1 + int64(40*time.Millisecond+50*time.Millisecond)
+	if have := snap2.TotalSum(); have != wantSum2 {
+		t.Fatalf("snap2 total sum: have %d, want %d", have, wantSum2)
+	}
+
+	// Empty snapshot should still report the same cumulative totals.
+	snap3 := timer.Snapshot()
+	if have, want := snap3.Count(), 0; have != want {
+		t.Fatalf("snap3 count: have %d, want %d", have, want)
+	}
+	if have, want := snap3.TotalCount(), int64(5); have != want {
+		t.Fatalf("snap3 total count: have %d, want %d", have, want)
+	}
+	if have := snap3.TotalSum(); have != wantSum2 {
+		t.Fatalf("snap3 total sum: have %d, want %d", have, wantSum2)
+	}
+}
+
+func TestResettingSampleCumulativeCountAndSum(t *testing.T) {
+	Enable()
+
+	s := ResettingSample(NewUniformSample(100))
+
+	// First batch.
+	s.Update(10)
+	s.Update(20)
+	s.Update(30)
+
+	snap1 := s.Snapshot()
+	if have, want := snap1.Count(), int64(3); have != want {
+		t.Fatalf("snap1 count: have %d, want %d", have, want)
+	}
+	if have, want := snap1.Sum(), int64(60); have != want {
+		t.Fatalf("snap1 sum: have %d, want %d", have, want)
+	}
+
+	// Second batch.
+	s.Update(40)
+	s.Update(50)
+
+	snap2 := s.Snapshot()
+	// Count should be cumulative: 3 + 2 = 5.
+	if have, want := snap2.Count(), int64(5); have != want {
+		t.Fatalf("snap2 count: have %d, want %d", have, want)
+	}
+	// Sum should be cumulative: 60 + 90 = 150.
+	if have, want := snap2.Sum(), int64(150); have != want {
+		t.Fatalf("snap2 sum: have %d, want %d", have, want)
+	}
+
+	// Empty snapshot should still report cumulative totals.
+	snap3 := s.Snapshot()
+	if have, want := snap3.Count(), int64(5); have != want {
+		t.Fatalf("snap3 count: have %d, want %d", have, want)
+	}
+	if have, want := snap3.Sum(), int64(150); have != want {
+		t.Fatalf("snap3 sum: have %d, want %d", have, want)
+	}
+}
+
 func TestResettingTimer(t *testing.T) {
 	tests := []struct {
 		values   []int64