From 6702bfb432f6e5fb49dfd3b12896ebbf289faecb Mon Sep 17 00:00:00 2001 From: Moshe Malawach Date: Tue, 21 Apr 2026 21:39:48 +0200 Subject: [PATCH] p2p/discover: fix data race on revalidationList fields Table.loop calls (*tableRevalidation).run without holding tab.mutex, while doRefresh -> loadSeedNodes -> handleAddNode reaches the same tableRevalidation state (nodeAdded -> list.push -> list.schedule) in a separate goroutine under tab.mutex. The two paths race on revalidationList.nextTime and list.nodes: the loop goroutine reads nextTime in run, schedules (writes it) and iterates list.nodes via list.get; the refresh goroutine appends to list.nodes and, on the first push, writes nextTime. Acquire tab.mutex for the full duration of run and document that startRequest must be called with the lock held. Remove the now-redundant internal lock/unlock pair in startRequest. Add a regression test that triggers the race deterministically under 'go test -race'. Closes #31460. --- p2p/discover/table_reval.go | 19 ++++++---- p2p/discover/table_reval_test.go | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/p2p/discover/table_reval.go b/p2p/discover/table_reval.go index 1519313d19..d19ce30298 100644 --- a/p2p/discover/table_reval.go +++ b/p2p/discover/table_reval.go @@ -76,8 +76,16 @@ func (tr *tableRevalidation) nodeEndpointChanged(tab *Table, n *tableNode) { // run performs node revalidation. // It returns the next time it should be invoked, which is used in the Table main loop // to schedule a timer. However, run can be called at any time. +// +// run acquires tab.mutex to synchronize with node additions performed from +// the doRefresh background goroutine, which reach this file through +// tab.handleAddNode -> tr.nodeAdded -> list.push -> list.schedule and therefore +// mutate the same revalidationList fields that run reads. func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mclock.AbsTime) { - reval := func(list *revalidationList) { + tab.mutex.Lock() + defer tab.mutex.Unlock() + + runList := func(list *revalidationList) { if list.nextTime <= now { if n := list.get(&tab.rand, tr.activeReq); n != nil { tr.startRequest(tab, n) @@ -87,24 +95,21 @@ func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mcloc list.schedule(now, &tab.rand) } } - reval(&tr.fast) - reval(&tr.slow) + runList(&tr.fast) + runList(&tr.slow) return min(tr.fast.nextTime, tr.slow.nextTime) } // startRequest spawns a revalidation request for node n. +// The caller must hold tab.mutex. func (tr *tableRevalidation) startRequest(tab *Table, n *tableNode) { if _, ok := tr.activeReq[n.ID()]; ok { panic(fmt.Errorf("duplicate startRequest (node %v)", n.ID())) } tr.activeReq[n.ID()] = struct{}{} resp := revalidationResponse{n: n} - - // Fetch the node while holding lock. - tab.mutex.Lock() node := n.Node - tab.mutex.Unlock() go tab.doRevalidate(resp, node) } diff --git a/p2p/discover/table_reval_test.go b/p2p/discover/table_reval_test.go index 3605443934..24d2826d05 100644 --- a/p2p/discover/table_reval_test.go +++ b/p2p/discover/table_reval_test.go @@ -18,6 +18,7 @@ package discover import ( "net" + "sync" "testing" "time" @@ -117,3 +118,62 @@ func TestRevalidation_endpointUpdate(t *testing.T) { t.Fatal("node is marked live after endpoint change") } } + +// TestRevalidation_concurrentAddAndRun reproduces the data race between the +// Table.loop goroutine (which calls tableRevalidation.run) and the doRefresh +// goroutine (which reaches tableRevalidation.nodeAdded via handleAddNode) on +// revalidationList.nextTime and revalidationList.nodes. See issue #31460. +// +// Without proper locking, this test reliably flags a race under "go test -race". +func TestRevalidation_concurrentAddAndRun(t *testing.T) { + var ( + transport = newPingRecorder() + // Real clock + small ping interval so that list.schedule produces + // nextTime values close to now, and tr.run repeatedly enters the body. + tab, db = newInactiveTestTable(transport, Config{PingInterval: time.Millisecond}) + ) + defer db.Close() + + // Seed the fast list so tr.run has something to iterate over. + for i := 0; i < 16; i++ { + n := nodeAtDistance(tab.self().ID(), 255, net.IP{10, 0, 0, byte(i)}) + tab.mutex.Lock() + tab.handleAddNode(addNodeOp{node: n}) + tab.mutex.Unlock() + } + + // A barrier so both goroutines start their loops together. + start := make(chan struct{}) + var wg sync.WaitGroup + wg.Add(2) + + const iterations = 2000 + + // Background goroutine: simulate doRefresh -> loadSeedNodes -> handleAddNode, + // which reaches tr.nodeAdded and appends to list.nodes (and may write + // list.nextTime via list.schedule). + go func() { + defer wg.Done() + <-start + for i := 0; i < iterations; i++ { + n := nodeAtDistance(tab.self().ID(), 200, net.IP{11, 0, byte(i >> 8), byte(i)}) + tab.mutex.Lock() + tab.handleAddNode(addNodeOp{node: n}) + tab.mutex.Unlock() + } + }() + + // Foreground goroutine: simulate Table.loop repeatedly calling tr.run, + // which reads list.nextTime and list.nodes and may write list.nextTime + // via list.schedule. + go func() { + defer wg.Done() + <-start + for i := 0; i < iterations; i++ { + tab.revalidation.run(tab, mclock.System{}.Now()) + } + }() + + close(start) + wg.Wait() +}