This commit is contained in:
Moshe Malawach 2026-06-18 21:55:14 -07:00 committed by GitHub
commit 7c55e55ad5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 72 additions and 7 deletions

View file

@ -76,8 +76,16 @@ func (tr *tableRevalidation) nodeEndpointChanged(tab *Table, n *tableNode) {
// run performs node revalidation.
// It returns the next time it should be invoked, which is used in the Table main loop
// to schedule a timer. However, run can be called at any time.
//
// run acquires tab.mutex to synchronize with node additions performed from
// the doRefresh background goroutine, which reach this file through
// tab.handleAddNode -> tr.nodeAdded -> list.push -> list.schedule and therefore
// mutate the same revalidationList fields that run reads.
func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mclock.AbsTime) {
reval := func(list *revalidationList) {
tab.mutex.Lock()
defer tab.mutex.Unlock()
runList := func(list *revalidationList) {
if list.nextTime <= now {
if n := list.get(&tab.rand, tr.activeReq); n != nil {
tr.startRequest(tab, n)
@ -87,24 +95,21 @@ func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mcloc
list.schedule(now, &tab.rand)
}
}
reval(&tr.fast)
reval(&tr.slow)
runList(&tr.fast)
runList(&tr.slow)
return min(tr.fast.nextTime, tr.slow.nextTime)
}
// startRequest spawns a revalidation request for node n.
// The caller must hold tab.mutex.
func (tr *tableRevalidation) startRequest(tab *Table, n *tableNode) {
if _, ok := tr.activeReq[n.ID()]; ok {
panic(fmt.Errorf("duplicate startRequest (node %v)", n.ID()))
}
tr.activeReq[n.ID()] = struct{}{}
resp := revalidationResponse{n: n}
// Fetch the node while holding lock.
tab.mutex.Lock()
node := n.Node
tab.mutex.Unlock()
go tab.doRevalidate(resp, node)
}

View file

@ -18,6 +18,7 @@ package discover
import (
"net"
"sync"
"testing"
"time"
@ -117,3 +118,62 @@ func TestRevalidation_endpointUpdate(t *testing.T) {
t.Fatal("node is marked live after endpoint change")
}
}
// TestRevalidation_concurrentAddAndRun reproduces the data race between the
// Table.loop goroutine (which calls tableRevalidation.run) and the doRefresh
// goroutine (which reaches tableRevalidation.nodeAdded via handleAddNode) on
// revalidationList.nextTime and revalidationList.nodes. See issue #31460.
//
// Without proper locking, this test reliably flags a race under "go test -race".
func TestRevalidation_concurrentAddAndRun(t *testing.T) {
var (
transport = newPingRecorder()
// Real clock + small ping interval so that list.schedule produces
// nextTime values close to now, and tr.run repeatedly enters the body.
tab, db = newInactiveTestTable(transport, Config{PingInterval: time.Millisecond})
)
defer db.Close()
// Seed the fast list so tr.run has something to iterate over.
for i := 0; i < 16; i++ {
n := nodeAtDistance(tab.self().ID(), 255, net.IP{10, 0, 0, byte(i)})
tab.mutex.Lock()
tab.handleAddNode(addNodeOp{node: n})
tab.mutex.Unlock()
}
// A barrier so both goroutines start their loops together.
start := make(chan struct{})
var wg sync.WaitGroup
wg.Add(2)
const iterations = 2000
// Background goroutine: simulate doRefresh -> loadSeedNodes -> handleAddNode,
// which reaches tr.nodeAdded and appends to list.nodes (and may write
// list.nextTime via list.schedule).
go func() {
defer wg.Done()
<-start
for i := 0; i < iterations; i++ {
n := nodeAtDistance(tab.self().ID(), 200, net.IP{11, 0, byte(i >> 8), byte(i)})
tab.mutex.Lock()
tab.handleAddNode(addNodeOp{node: n})
tab.mutex.Unlock()
}
}()
// Foreground goroutine: simulate Table.loop repeatedly calling tr.run,
// which reads list.nextTime and list.nodes and may write list.nextTime
// via list.schedule.
go func() {
defer wg.Done()
<-start
for i := 0; i < iterations; i++ {
tab.revalidation.run(tab, mclock.System{}.Now())
}
}()
close(start)
wg.Wait()
}