p2p/discover: document BFS choice, add RandomWorkers split

Two related changes to CrawlIterator:

(1) Add a file-level commentary block explaining why the iterator uses a
FIFO queue (BFS over the FINDNODE-response graph) and what it is *not*
suitable for (target-directed lookup -- use RandomNodes() / the alpha=3
lookup iterator for that). The choice was inherited from dcrawl.nim
without explicit reasoning; making it visible avoids future readers
re-deriving the survey-vs-lookup distinction.

The BFS rationale is two-fold:

 - Coverage: BFS reaches every peer within N hops of the seeds in
   order, so a time-bounded run produces a representative sample of the
   reachable graph rather than a deep tendril through one sub-region.
 - Adversarial resilience: a peer returning malicious "neighbour"
   claims, dead-end peers, or eclipse-style sub-graphs cannot
   monopolise the worker pool, because pending work from other branches
   sits ahead of the attacker's responses in the queue. DFS would
   amplify each of these attacks.

(2) Add a RandomWorkers field to CrawlOptions. Of the Workers-sized
worker pool, the first (Workers - RandomWorkers) workers pop the FIFO
front (BFS), while RandomWorkers workers pop a uniform-random queue
index via swap-and-pop (O(1)). Total worker count is unchanged.

Default RandomWorkers = Workers / 4 (4 of 16 with the default
parallelism). At this ratio:

 - Cold-start cost is negligible: 12 of 16 workers still drain FIFO,
   so the first ~1s of a fresh crawl behaves like pure BFS.
 - 25% of pops break strict FIFO ordering, providing a mild
   anti-fingerprint defence against an attacker who could otherwise
   predict our processing order from the contents of their own
   FINDNODE responses.

Operators can override per-run via the new --random-workers CLI flag
on `devp2p discv4 crawl` and `discv5 crawl`. Negative value forces
pure BFS; positive value selects an explicit count.

The new TestCrawlIteratorRandomWorkers covers four pop-policy
configurations (all-fifo, all-random, half-half, default) and
asserts the iterator still terminates and emits each node exactly
once in each.
This commit is contained in:
Csaba Kiraly 2026-05-07 09:38:59 +02:00
parent b026ef6bb7
commit 33785aab21
No known key found for this signature in database
4 changed files with 164 additions and 16 deletions

View file

@ -91,7 +91,7 @@ var (
Name: "crawl",
Usage: "Updates a nodes.json file with random nodes found in the DHT",
Action: discv4Crawl,
Flags: slices.Concat(discoveryNodeFlags, []cli.Flag{crawlTimeoutFlag, crawlParallelismFlag, crawlModeFlag}),
Flags: slices.Concat(discoveryNodeFlags, []cli.Flag{crawlTimeoutFlag, crawlParallelismFlag, crawlModeFlag, crawlRandomWorkersFlag}),
}
discv4TestCommand = &cli.Command{
Name: "test",
@ -143,6 +143,11 @@ var (
Usage: "Crawl iterator mode: 'lookup' (alpha-bounded Kademlia lookup) or 'fast' (one FINDNODE per peer with rotating prefix; sized by -parallel).",
Value: "lookup",
}
crawlRandomWorkersFlag = &cli.IntFlag{
Name: "random-workers",
Usage: "Of the -parallel workers in -mode=fast, how many pop a random queue item rather than the FIFO front. 0 = library default (parallel/4); negative = pure BFS.",
Value: 0,
}
remoteEnodeFlag = &cli.StringFlag{
Name: "remote",
Usage: "Enode of the remote node under test",
@ -264,7 +269,7 @@ func discv4Crawl(ctx *cli.Context) error {
disc, config := startV4(ctx)
defer disc.Close()
iter, err := newDiscv4CrawlIterator(disc, config.Bootnodes, ctx.String(crawlModeFlag.Name), ctx.Int(crawlParallelismFlag.Name))
iter, err := newDiscv4CrawlIterator(disc, config.Bootnodes, ctx.String(crawlModeFlag.Name), ctx.Int(crawlParallelismFlag.Name), ctx.Int(crawlRandomWorkersFlag.Name))
if err != nil {
return err
}
@ -278,14 +283,15 @@ func discv4Crawl(ctx *cli.Context) error {
return nil
}
func newDiscv4CrawlIterator(disc *discover.UDPv4, bootnodes []*enode.Node, mode string, parallel int) (enode.Iterator, error) {
func newDiscv4CrawlIterator(disc *discover.UDPv4, bootnodes []*enode.Node, mode string, parallel, randomWorkers int) (enode.Iterator, error) {
switch mode {
case "", "lookup":
return disc.RandomNodes(), nil
case "fast":
return disc.CrawlIterator(discover.CrawlOptions{
Workers: parallel,
Seeds: bootnodes,
Workers: parallel,
RandomWorkers: randomWorkers,
Seeds: bootnodes,
}), nil
default:
return nil, fmt.Errorf("unknown -%s value %q (want 'lookup' or 'fast')", crawlModeFlag.Name, mode)

View file

@ -61,6 +61,7 @@ var (
crawlTimeoutFlag,
crawlParallelismFlag,
crawlModeFlag,
crawlRandomWorkersFlag,
}),
}
discv5TestCommand = &cli.Command{
@ -114,7 +115,7 @@ func discv5Crawl(ctx *cli.Context) error {
disc, config := startV5(ctx)
defer disc.Close()
iter, err := newDiscv5CrawlIterator(disc, config.Bootnodes, ctx.String(crawlModeFlag.Name), ctx.Int(crawlParallelismFlag.Name))
iter, err := newDiscv5CrawlIterator(disc, config.Bootnodes, ctx.String(crawlModeFlag.Name), ctx.Int(crawlParallelismFlag.Name), ctx.Int(crawlRandomWorkersFlag.Name))
if err != nil {
return err
}
@ -128,14 +129,15 @@ func discv5Crawl(ctx *cli.Context) error {
return nil
}
func newDiscv5CrawlIterator(disc *discover.UDPv5, bootnodes []*enode.Node, mode string, parallel int) (enode.Iterator, error) {
func newDiscv5CrawlIterator(disc *discover.UDPv5, bootnodes []*enode.Node, mode string, parallel, randomWorkers int) (enode.Iterator, error) {
switch mode {
case "", "lookup":
return disc.RandomNodes(), nil
case "fast":
return disc.CrawlIterator(discover.CrawlOptions{
Workers: parallel,
Seeds: bootnodes,
Workers: parallel,
RandomWorkers: randomWorkers,
Seeds: bootnodes,
}), nil
default:
return nil, fmt.Errorf("unknown -%s value %q (want 'lookup' or 'fast')", crawlModeFlag.Name, mode)

View file

@ -18,6 +18,7 @@ package discover
import (
crand "crypto/rand"
"math/rand/v2"
"sync"
"sync/atomic"
@ -25,6 +26,37 @@ import (
"github.com/ethereum/go-ethereum/p2p/enode"
)
// CrawlIterator performs a breadth-first crawl of the discv4/discv5
// FINDNODE-response graph. Each worker pops a peer from the work queue,
// issues one FINDNODE call against it, and feeds any newly-seen peers
// back into the queue (and the iterator's output buffer).
//
// BFS (FIFO queue) was chosen over DFS or target-directed lookup for two
// reasons that align with the iterator's intended use case (survey-style
// crawls — devp2p crawl, geth dial-candidate discovery):
//
// - Coverage: BFS reaches every peer within N hops of the seeds in
// order, so a time-bounded run produces a representative sample of
// the reachable graph rather than a deep tendril through one
// sub-region.
// - Adversarial resilience: a peer returning malicious "neighbour"
// claims, dead-end peers, or eclipse-style sub-graphs cannot
// monopolise the worker pool, because pending work from other
// branches sits ahead of the attacker's responses in the queue.
// DFS would amplify each of these attacks.
//
// CrawlIterator is NOT a target-directed search. To find a specific node
// (or the K closest to a known target), use the lookup-based iterator
// returned by [UDPv4.RandomNodes] / [UDPv5.RandomNodes] instead — those
// run an alpha-bounded Kademlia lookup, which is the right shape for
// "find peer X" but the wrong shape for "survey the network".
//
// Pop policy can be tuned via [CrawlOptions.RandomWorkers]: any subset
// of the worker pool can be configured to pop a uniform-random queue
// item instead of the FIFO front, breaking strict ordering as a mild
// anti-fingerprint defence. The remaining workers pop FIFO and preserve
// the BFS character. See the field doc for details.
// CrawlOptions configures a CrawlIterator.
type CrawlOptions struct {
// Workers is the number of concurrent FINDNODE calls in flight.
@ -58,6 +90,23 @@ type CrawlOptions struct {
// once across a long crawl. Realistic bound is the reachable DHT size
// (~1M peers, ~50 MB).
OutputCap int
// RandomWorkers is the number of workers in the pool that pop a uniform-
// random queue item instead of the FIFO front. The remaining
// Workers - RandomWorkers workers behave as today, popping front. Total
// worker count is unchanged at Workers; only the BFS-vs-random split
// differs.
//
// Zero (the struct zero-value) selects the library default of
// Workers/4 rounded down. Pass a negative value (e.g. -1) to force
// pure BFS with zero random workers. A positive value > Workers is
// clamped to Workers (i.e. fully-random pop with no FIFO workers).
//
// The default Workers/4 mix preserves the BFS coverage character (cold
// start is dominated by the FIFO majority) while breaking strict FIFO
// ordering, a mild anti-fingerprint defence. See the file-level comment
// for details.
RandomWorkers int
}
func (o *CrawlOptions) withDefaults() {
@ -73,6 +122,17 @@ func (o *CrawlOptions) withDefaults() {
if o.OutputCap <= 0 {
o.OutputCap = 16 * o.Workers
}
switch {
case o.RandomWorkers < 0:
// Negative means: explicit pure BFS, no random workers.
o.RandomWorkers = 0
case o.RandomWorkers == 0:
// Zero (struct zero-value) means: library default.
o.RandomWorkers = o.Workers / 4
case o.RandomWorkers > o.Workers:
// Clamp; can't have more random workers than total workers.
o.RandomWorkers = o.Workers
}
}
// CrawlIterator returns an enode.Iterator that performs a breadth-first
@ -166,10 +226,17 @@ func newCrawlIterator(opts CrawlOptions, queryFn func(*enode.Node, int) ([]*enod
it.inflight++
}
// Workers.
for i := 0; i < opts.Workers; i++ {
// Workers. The first (Workers - RandomWorkers) workers pop FIFO; the
// remaining RandomWorkers pop a uniform-random queue index. Both share
// the same queue, mutex, cond, and termination semantics.
fifoCount := opts.Workers - opts.RandomWorkers
for i := 0; i < fifoCount; i++ {
it.wg.Add(1)
go it.worker()
go it.worker(false)
}
for i := 0; i < opts.RandomWorkers; i++ {
it.wg.Add(1)
go it.worker(true)
}
return it
}
@ -207,8 +274,10 @@ func (it *crawlIterator) discover(n *enode.Node) {
}
// popWork blocks until either a peer is available to query, or the iterator
// has nothing left to do. Returns (nil, false) on termination.
func (it *crawlIterator) popWork() (*enode.Node, bool) {
// has nothing left to do. Returns (nil, false) on termination. If random is
// true, pops a uniform-random index via swap-and-pop; otherwise pops the
// FIFO front. Both modes share the same termination semantics.
func (it *crawlIterator) popWork(random bool) (*enode.Node, bool) {
it.mu.Lock()
defer it.mu.Unlock()
for {
@ -216,6 +285,14 @@ func (it *crawlIterator) popWork() (*enode.Node, bool) {
return nil, false
}
if len(it.queue) > 0 {
if random {
i := rand.IntN(len(it.queue))
n := it.queue[i]
// Swap-and-pop: O(1) removal from middle.
it.queue[i] = it.queue[len(it.queue)-1]
it.queue = it.queue[:len(it.queue)-1]
return n, true
}
n := it.queue[0]
it.queue = it.queue[1:]
return n, true
@ -243,10 +320,10 @@ func (it *crawlIterator) finishWork() {
}
}
func (it *crawlIterator) worker() {
func (it *crawlIterator) worker(random bool) {
defer it.wg.Done()
for {
n, ok := it.popWork()
n, ok := it.popWork(random)
if !ok {
return
}

View file

@ -223,6 +223,69 @@ func TestCrawlIteratorOutputCap(t *testing.T) {
}
}
// TestCrawlIteratorRandomWorkers verifies that with a mixed FIFO + random
// worker pool, the iterator still terminates and emits each node exactly
// once on the synthetic graph from TestCrawlIteratorTerminates.
func TestCrawlIteratorRandomWorkers(t *testing.T) {
for _, tc := range []struct {
name string
workers int
randomWorkers int
}{
{"all-fifo", 4, -1}, // -1 → pure BFS
{"all-random", 4, 4}, // all workers random-pop
{"half-half", 8, 4}, // 4 FIFO + 4 random
{"default", 16, 0}, // 0 → library default = 4 random of 16
} {
t.Run(tc.name, func(t *testing.T) {
nodes := makeTestNodes(t, 50)
neighbours := make(map[enode.ID][]*enode.Node, len(nodes))
for i, n := range nodes {
var ns []*enode.Node
for k := 1; k <= 5; k++ {
ns = append(ns, nodes[(i+k)%len(nodes)])
}
neighbours[n.ID()] = ns
}
var calls atomic.Int64
queryFn := func(dst *enode.Node, _ int) ([]*enode.Node, error) {
calls.Add(1)
return neighbours[dst.ID()], nil
}
it := newCrawlIterator(CrawlOptions{
Workers: tc.workers,
RandomWorkers: tc.randomWorkers,
Seeds: []*enode.Node{nodes[0]},
Drange: 16,
}, queryFn)
seen := make(map[enode.ID]int)
done := make(chan struct{})
go func() {
for it.Next() {
seen[it.Node().ID()]++
}
close(done)
}()
select {
case <-done:
case <-time.After(5 * time.Second):
t.Fatal("iterator did not terminate within 5s")
}
if got := len(seen); got != len(nodes) {
t.Fatalf("emitted %d distinct nodes, want %d", got, len(nodes))
}
for id, c := range seen {
if c != 1 {
t.Errorf("node %x emitted %d times, want 1", id[:4], c)
}
}
if got, want := calls.Load(), int64(len(nodes)); got != want {
t.Errorf("queryFn invoked %d times, want %d", got, want)
}
})
}
}
// TestCrawlIteratorRotation verifies that the d argument passed to queryFn
// rotates through 0..Drange-1.
func TestCrawlIteratorRotation(t *testing.T) {