mirror of
https://github.com/ethereum/go-ethereum.git
synced 2026-05-26 17:59:29 +00:00
Add an enode.Iterator that drives discovery by issuing a single FINDNODE per discovered peer, rotating the target through Drange sub-regions of the keyspace. Compared to RandomNodes (which wraps an alpha=3 Kademlia lookup that converges on a single target), this shape is geared for breadth: each peer is asked about a different slice of the keyspace, so aggregate coverage grows quickly without per-peer overlap. The two protocols expose different FINDNODE primitives, so the iterator threads a per-protocol queryFn: * discv5 takes a list of distances natively, so we just pass [256-d] for d in 0..Drange-1. * discv4 takes a target NodeID and replies with the K closest. To get an equivalent rotation, we pick a random pubkey whose Keccak256 starts with the desired prefix nibble. With Drange=16 that's ~16 random draws per call -- negligible compared to the network round trip. Concurrency is bounded by Workers (default 16). There is intentionally no rate limit: pacing is RTT-driven, ~Workers/RTT on the wire. Termination is implicit: when the work queue is empty AND no FINDNODE is in flight, the iterator closes its output and Next returns false. Close() short-circuits this for callers that want to bail early. Adapts the algorithm from github.com/cskiraly/fast-ethereum-crawler (dcrawl.nim) -- the prefix-rotation idea -- but drops its 1000 req/s rate limit in favour of the bounded worker pool.
343 lines
9.9 KiB
Go
343 lines
9.9 KiB
Go
// Copyright 2026 The go-ethereum Authors
|
|
// This file is part of the go-ethereum library.
|
|
//
|
|
// The go-ethereum library is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// The go-ethereum library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package discover
|
|
|
|
import (
|
|
crand "crypto/rand"
|
|
"sync"
|
|
"sync/atomic"
|
|
|
|
"github.com/ethereum/go-ethereum/crypto"
|
|
"github.com/ethereum/go-ethereum/p2p/discover/v4wire"
|
|
"github.com/ethereum/go-ethereum/p2p/enode"
|
|
)
|
|
|
|
// CrawlOptions configures a CrawlIterator.
|
|
type CrawlOptions struct {
|
|
// Workers is the number of concurrent FINDNODE calls in flight.
|
|
// If <= 0, a default of 16 is used.
|
|
Workers int
|
|
|
|
// Seeds are the nodes to start the crawl from. If empty, the iterator
|
|
// terminates immediately. Callers should pass at least the bootnodes.
|
|
Seeds []*enode.Node
|
|
|
|
// Drange is the number of keyspace sub-regions to rotate the FINDNODE
|
|
// target through. Defaults to 16. Internally rounded up to the next power
|
|
// of two and capped at 256 (the keyspace top-byte width); on the discv4
|
|
// path this caps the prefix-bit grind cost, on the discv5 path it caps
|
|
// the rotation to valid distances [1, 256].
|
|
Drange int
|
|
|
|
// OutputCap bounds the number of newly-discovered peers buffered for the
|
|
// caller's Next() to drain. When the buffer reaches this size, workers
|
|
// pause discovery via cond.Wait until Next() drains it. This is the
|
|
// iterator's backpressure point for slow consumers and adversarial peers
|
|
// flooding fresh ENRs in FINDNODE responses.
|
|
//
|
|
// If <= 0, defaults to 16 * Workers (≥ one full FINDNODE response per
|
|
// worker). Set higher for callers that drain in large batches.
|
|
//
|
|
// Note: the dedup set and the worker work-queue are not bounded; their
|
|
// growth is implicit in any iterator that emits each unique peer exactly
|
|
// once across a long crawl. Realistic bound is the reachable DHT size
|
|
// (~1M peers, ~50 MB).
|
|
OutputCap int
|
|
}
|
|
|
|
func (o *CrawlOptions) withDefaults() {
|
|
if o.Workers <= 0 {
|
|
o.Workers = 16
|
|
}
|
|
if o.Drange <= 0 {
|
|
o.Drange = 16
|
|
}
|
|
if o.Drange > 256 {
|
|
o.Drange = 256
|
|
}
|
|
// Round up to the next power of two so the prefix-bit width is
|
|
// well-defined and Drange divides the top byte of the keyspace evenly.
|
|
if o.Drange&(o.Drange-1) != 0 {
|
|
o.Drange = nextPowerOfTwo(o.Drange)
|
|
}
|
|
if o.OutputCap <= 0 {
|
|
o.OutputCap = 16 * o.Workers
|
|
}
|
|
}
|
|
|
|
// nextPowerOfTwo returns the smallest power of two that is >= n. It is meant
// for n in [1, 256]; any n <= 1 yields 1.
func nextPowerOfTwo(n int) int {
	pow := 1
	for ; pow < n; pow *= 2 {
		// Keep doubling until pow reaches or passes n.
	}
	return pow
}
|
|
|
|
// CrawlIterator returns an enode.Iterator that performs a breadth-first
|
|
// crawl by issuing a single FINDNODE request per discovered peer, rotating
|
|
// the request's target through Drange sub-regions of the keyspace so that
|
|
// each peer is asked about a different slice. Compared to RandomNodes, this
|
|
// avoids the alpha-bounded Kademlia lookup convergence loop and is the right
|
|
// shape for breadth crawls (e.g. devp2p discv4 crawl).
|
|
//
|
|
// Concurrency is bounded by opts.Workers; pacing is RTT-driven, not
|
|
// rate-limited.
|
|
func (t *UDPv4) CrawlIterator(opts CrawlOptions) enode.Iterator {
|
|
opts.withDefaults()
|
|
prefixBits := log2Pow2(opts.Drange)
|
|
queryFn := func(dst *enode.Node, d int) ([]*enode.Node, error) {
|
|
addr, ok := dst.UDPEndpoint()
|
|
if !ok {
|
|
return nil, errNoUDPEndpoint
|
|
}
|
|
target := randomTargetWithPrefix(uint8(d), prefixBits)
|
|
peers, err := t.findnode(dst.ID(), addr, target)
|
|
if err != nil {
|
|
t.log.Trace("FINDNODE failed", "id", dst.ID(), "err", err)
|
|
}
|
|
return peers, err
|
|
}
|
|
return newCrawlIterator(opts, queryFn)
|
|
}
|
|
|
|
// CrawlIterator returns an enode.Iterator that performs a breadth-first
|
|
// crawl using single-distance FINDNODE requests. See [UDPv4.CrawlIterator]
|
|
// for the algorithm; the discv5 protocol takes a list of distances directly,
|
|
// so the rotation maps to distances [256, 255, ..., 256-Drange+1].
|
|
func (t *UDPv5) CrawlIterator(opts CrawlOptions) enode.Iterator {
|
|
queryFn := func(dst *enode.Node, d int) ([]*enode.Node, error) {
|
|
dist := uint(256 - d)
|
|
peers, err := t.Findnode(dst, []uint{dist})
|
|
if err != nil {
|
|
t.log.Trace("FINDNODE failed", "id", dst.ID(), "err", err)
|
|
}
|
|
return peers, err
|
|
}
|
|
return newCrawlIterator(opts, queryFn)
|
|
}
|
|
|
|
// log2Pow2 returns the base-2 logarithm of n, where n is expected to be a
// power of two. For other positive inputs the result is rounded down; for
// n <= 1 it is 0.
func log2Pow2(n int) int {
	exp := 0
	for rem := n; rem > 1; rem /= 2 {
		exp++
	}
	return exp
}
|
|
|
|
// randomTargetWithPrefix returns a v4wire.Pubkey whose Keccak256 hash has its
|
|
// top `bits` bits equal to d. On average ~2^bits draws are needed.
|
|
func randomTargetWithPrefix(d uint8, bits int) v4wire.Pubkey {
|
|
if bits == 0 {
|
|
var pk v4wire.Pubkey
|
|
crand.Read(pk[:])
|
|
return pk
|
|
}
|
|
for {
|
|
var pk v4wire.Pubkey
|
|
crand.Read(pk[:])
|
|
h := crypto.Keccak256(pk[:])
|
|
if (h[0] >> (8 - bits)) == d {
|
|
return pk
|
|
}
|
|
}
|
|
}
|
|
|
|
// crawlIterator is a breadth-first FINDNODE-driven iterator. It maintains a
|
|
// shared work queue and an output buffer; workers pop from the queue, issue
|
|
// one FINDNODE per pop, and feed any newly-seen peers back into both the
|
|
// queue and the output buffer. The iterator terminates when the queue is
|
|
// empty and no FINDNODE call is in flight.
|
|
type crawlIterator struct {
|
|
queryFn func(dst *enode.Node, d int) ([]*enode.Node, error)
|
|
drange int
|
|
outputCap int
|
|
wg sync.WaitGroup
|
|
|
|
mu sync.Mutex
|
|
cond *sync.Cond
|
|
queue []*enode.Node // pending FINDNODE work
|
|
output []*enode.Node // emitted peers (one-time)
|
|
discovered map[enode.ID]struct{}
|
|
inflight int // queued + in-progress
|
|
closing bool // Close() called or natural termination
|
|
cur *enode.Node
|
|
|
|
rotation atomic.Uint64
|
|
}
|
|
|
|
func newCrawlIterator(opts CrawlOptions, queryFn func(*enode.Node, int) ([]*enode.Node, error)) *crawlIterator {
|
|
opts.withDefaults()
|
|
it := &crawlIterator{
|
|
queryFn: queryFn,
|
|
drange: opts.Drange,
|
|
outputCap: opts.OutputCap,
|
|
discovered: make(map[enode.ID]struct{}),
|
|
}
|
|
it.cond = sync.NewCond(&it.mu)
|
|
|
|
// Seed directly into the queue/output. Going through discover() would
|
|
// block on the OutputCap if len(Seeds) > OutputCap, deadlocking the
|
|
// constructor since workers haven't started and Next() hasn't been
|
|
// called yet.
|
|
for _, n := range opts.Seeds {
|
|
if n == nil {
|
|
continue
|
|
}
|
|
if _, seen := it.discovered[n.ID()]; seen {
|
|
continue
|
|
}
|
|
it.discovered[n.ID()] = struct{}{}
|
|
it.queue = append(it.queue, n)
|
|
it.output = append(it.output, n)
|
|
it.inflight++
|
|
}
|
|
|
|
// Workers.
|
|
for i := 0; i < opts.Workers; i++ {
|
|
it.wg.Add(1)
|
|
go it.worker()
|
|
}
|
|
return it
|
|
}
|
|
|
|
// discover records a newly-seen peer. Acquires mu internally; callers
|
|
// MUST NOT hold it. If output is at capacity, waits on cond until Next()
|
|
// drains it; this is the iterator's backpressure point.
|
|
func (it *crawlIterator) discover(n *enode.Node) {
|
|
if n == nil {
|
|
return
|
|
}
|
|
it.mu.Lock()
|
|
defer it.mu.Unlock()
|
|
for {
|
|
if it.closing {
|
|
return
|
|
}
|
|
if _, seen := it.discovered[n.ID()]; seen {
|
|
return
|
|
}
|
|
if it.outputCap > 0 && len(it.output) >= it.outputCap {
|
|
// Pause discovery until the consumer drains output. Releases mu
|
|
// while waiting so other workers can keep popping from queue and
|
|
// the consumer can pop from output.
|
|
it.cond.Wait()
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
it.discovered[n.ID()] = struct{}{}
|
|
it.queue = append(it.queue, n)
|
|
it.output = append(it.output, n)
|
|
it.inflight++
|
|
it.cond.Broadcast()
|
|
}
|
|
|
|
// popWork blocks until either a peer is available to query, or the iterator
|
|
// has nothing left to do. Returns (nil, false) on termination.
|
|
func (it *crawlIterator) popWork() (*enode.Node, bool) {
|
|
it.mu.Lock()
|
|
defer it.mu.Unlock()
|
|
for {
|
|
if it.closing {
|
|
return nil, false
|
|
}
|
|
if len(it.queue) > 0 {
|
|
n := it.queue[0]
|
|
it.queue = it.queue[1:]
|
|
return n, true
|
|
}
|
|
if it.inflight == 0 {
|
|
// Queue empty AND nothing in flight: natural termination.
|
|
it.closing = true
|
|
it.cond.Broadcast()
|
|
return nil, false
|
|
}
|
|
it.cond.Wait()
|
|
}
|
|
}
|
|
|
|
// finishWork is called by workers after their FINDNODE response has been
|
|
// processed. It decrements the in-flight counter and broadcasts so a possibly
|
|
// idle worker can re-evaluate termination.
|
|
func (it *crawlIterator) finishWork() {
|
|
it.mu.Lock()
|
|
defer it.mu.Unlock()
|
|
it.inflight--
|
|
if it.inflight == 0 && len(it.queue) == 0 {
|
|
it.closing = true
|
|
it.cond.Broadcast()
|
|
}
|
|
}
|
|
|
|
func (it *crawlIterator) worker() {
|
|
defer it.wg.Done()
|
|
for {
|
|
n, ok := it.popWork()
|
|
if !ok {
|
|
return
|
|
}
|
|
d := int(it.rotation.Add(1)-1) % it.drange
|
|
peers, _ := it.queryFn(n, d)
|
|
for _, p := range peers {
|
|
it.discover(p)
|
|
}
|
|
it.finishWork()
|
|
}
|
|
}
|
|
|
|
// Next blocks until a newly-discovered peer is available, then returns true
|
|
// and makes the peer accessible via Node. Returns false when the iterator
|
|
// has terminated.
|
|
func (it *crawlIterator) Next() bool {
|
|
it.mu.Lock()
|
|
defer it.mu.Unlock()
|
|
for len(it.output) == 0 {
|
|
if it.closing {
|
|
return false
|
|
}
|
|
it.cond.Wait()
|
|
}
|
|
it.cur = it.output[0]
|
|
it.output = it.output[1:]
|
|
// Wake any worker stalled in discover() because output was at capacity.
|
|
it.cond.Broadcast()
|
|
return true
|
|
}
|
|
|
|
// Node returns the most recent peer surfaced by Next.
|
|
func (it *crawlIterator) Node() *enode.Node {
|
|
it.mu.Lock()
|
|
defer it.mu.Unlock()
|
|
return it.cur
|
|
}
|
|
|
|
// Close terminates the iterator, unblocking any goroutines waiting in Next.
|
|
// Workers exit at their next poll point; in-flight FINDNODE responses are
|
|
// dropped.
|
|
func (it *crawlIterator) Close() {
|
|
it.mu.Lock()
|
|
if !it.closing {
|
|
it.closing = true
|
|
it.cond.Broadcast()
|
|
}
|
|
it.mu.Unlock()
|
|
it.wg.Wait()
|
|
}
|