diff --git a/p2p/discover/table_reval.go b/p2p/discover/table_reval.go index 1519313d19..d19ce30298 100644 --- a/p2p/discover/table_reval.go +++ b/p2p/discover/table_reval.go @@ -76,8 +76,16 @@ func (tr *tableRevalidation) nodeEndpointChanged(tab *Table, n *tableNode) { // run performs node revalidation. // It returns the next time it should be invoked, which is used in the Table main loop // to schedule a timer. However, run can be called at any time. +// +// run acquires tab.mutex to synchronize with node additions performed from +// the doRefresh background goroutine, which reach this file through +// tab.handleAddNode -> tr.nodeAdded -> list.push -> list.schedule and therefore +// mutate the same revalidationList fields that run reads. func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mclock.AbsTime) { - reval := func(list *revalidationList) { + tab.mutex.Lock() + defer tab.mutex.Unlock() + + runList := func(list *revalidationList) { if list.nextTime <= now { if n := list.get(&tab.rand, tr.activeReq); n != nil { tr.startRequest(tab, n) @@ -87,24 +95,21 @@ func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mcloc list.schedule(now, &tab.rand) } } - reval(&tr.fast) - reval(&tr.slow) + runList(&tr.fast) + runList(&tr.slow) return min(tr.fast.nextTime, tr.slow.nextTime) } // startRequest spawns a revalidation request for node n. +// The caller must hold tab.mutex. func (tr *tableRevalidation) startRequest(tab *Table, n *tableNode) { if _, ok := tr.activeReq[n.ID()]; ok { panic(fmt.Errorf("duplicate startRequest (node %v)", n.ID())) } tr.activeReq[n.ID()] = struct{}{} resp := revalidationResponse{n: n} - - // Fetch the node while holding lock. - tab.mutex.Lock() node := n.Node - tab.mutex.Unlock() go tab.doRevalidate(resp, node) } diff --git a/p2p/discover/table_reval_test.go b/p2p/discover/table_reval_test.go index 3605443934..24d2826d05 100644 --- a/p2p/discover/table_reval_test.go +++ b/p2p/discover/table_reval_test.go @@ -18,6 +18,7 @@ package discover import ( "net" + "sync" "testing" "time" @@ -117,3 +118,62 @@ func TestRevalidation_endpointUpdate(t *testing.T) { t.Fatal("node is marked live after endpoint change") } } + +// TestRevalidation_concurrentAddAndRun reproduces the data race between the +// Table.loop goroutine (which calls tableRevalidation.run) and the doRefresh +// goroutine (which reaches tableRevalidation.nodeAdded via handleAddNode) on +// revalidationList.nextTime and revalidationList.nodes. See issue #31460. +// +// Without proper locking, this test reliably flags a race under "go test -race". +func TestRevalidation_concurrentAddAndRun(t *testing.T) { + var ( + transport = newPingRecorder() + // Real clock + small ping interval so that list.schedule produces + // nextTime values close to now, and tr.run repeatedly enters the body. + tab, db = newInactiveTestTable(transport, Config{PingInterval: time.Millisecond}) + ) + defer db.Close() + + // Seed the fast list so tr.run has something to iterate over. + for i := 0; i < 16; i++ { + n := nodeAtDistance(tab.self().ID(), 255, net.IP{10, 0, 0, byte(i)}) + tab.mutex.Lock() + tab.handleAddNode(addNodeOp{node: n}) + tab.mutex.Unlock() + } + + // A barrier so both goroutines start their loops together. + start := make(chan struct{}) + var wg sync.WaitGroup + wg.Add(2) + + const iterations = 2000 + + // Background goroutine: simulate doRefresh -> loadSeedNodes -> handleAddNode, + // which reaches tr.nodeAdded and appends to list.nodes (and may write + // list.nextTime via list.schedule). + go func() { + defer wg.Done() + <-start + for i := 0; i < iterations; i++ { + n := nodeAtDistance(tab.self().ID(), 200, net.IP{11, 0, byte(i >> 8), byte(i)}) + tab.mutex.Lock() + tab.handleAddNode(addNodeOp{node: n}) + tab.mutex.Unlock() + } + }() + + // Foreground goroutine: simulate Table.loop repeatedly calling tr.run, + // which reads list.nextTime and list.nodes and may write list.nextTime + // via list.schedule. + go func() { + defer wg.Done() + <-start + for i := 0; i < iterations; i++ { + tab.revalidation.run(tab, mclock.System{}.Now()) + } + }() + + close(start) + wg.Wait() +}