diff --git a/cmd/utils/flags.go b/cmd/utils/flags.go index c41cf4ee40..7cc2c5aaba 100644 --- a/cmd/utils/flags.go +++ b/cmd/utils/flags.go @@ -58,6 +58,7 @@ import ( "github.com/ethereum/go-ethereum/graphql" "github.com/ethereum/go-ethereum/internal/ethapi" "github.com/ethereum/go-ethereum/internal/flags" + "github.com/ethereum/go-ethereum/internal/memlimit" "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/metrics" "github.com/ethereum/go-ethereum/metrics/exp" @@ -74,7 +75,6 @@ import ( "github.com/ethereum/go-ethereum/triedb/hashdb" "github.com/ethereum/go-ethereum/triedb/pathdb" pcsclite "github.com/gballet/go-libpcsclite" - gopsutil "github.com/shirou/gopsutil/mem" "github.com/urfave/cli/v2" ) @@ -1751,16 +1751,18 @@ func SetEthConfig(ctx *cli.Context, stack *node.Node, cfg *ethconfig.Config) { setMiner(ctx, &cfg.Miner) setRequiredBlocks(ctx, cfg) - // Cap the cache allowance and tune the garbage collector - mem, err := gopsutil.VirtualMemory() - if err == nil { - if 32<<(^uintptr(0)>>63) == 32 && mem.Total > 2*1024*1024*1024 { - log.Warn("Lowering memory allowance on 32bit arch", "available", mem.Total/1024/1024, "addressable", 2*1024) - mem.Total = 2 * 1024 * 1024 * 1024 + // Cap the cache allowance and tune the garbage collector against + // the effective memory limit (cgroup-imposed when running in a + // container, total system memory otherwise). + total, source := memlimit.Limit() + if total > 0 { + if 32<<(^uintptr(0)>>63) == 32 && total > 2*1024*1024*1024 { + log.Warn("Lowering memory allowance on 32bit arch", "available", total/1024/1024, "addressable", 2*1024) + total = 2 * 1024 * 1024 * 1024 } - allowance := int(mem.Total / 1024 / 1024 / 3) + allowance := int(total / 1024 / 1024 / 3) if cache := ctx.Int(CacheFlag.Name); cache > allowance { - log.Warn("Sanitizing cache to Go's GC limits", "provided", cache, "updated", allowance) + log.Warn("Sanitizing cache to Go's GC limits", "source", source, "provided", cache, "updated", allowance) ctx.Set(CacheFlag.Name, strconv.Itoa(allowance)) } } @@ -1775,14 +1777,14 @@ func SetEthConfig(ctx *cli.Context, stack *node.Node, cfg *ethconfig.Config) { cfg.SyncMode = ethconfig.FullSync // dev sync target forces full sync } else if ctx.IsSet(SyncModeFlag.Name) { value := ctx.String(SyncModeFlag.Name) - if err = cfg.SyncMode.UnmarshalText([]byte(value)); err != nil { + if err := cfg.SyncMode.UnmarshalText([]byte(value)); err != nil { Fatalf("--%v: %v", SyncModeFlag.Name, err) } } if ctx.IsSet(ChainHistoryFlag.Name) { value := ctx.String(ChainHistoryFlag.Name) - if err = cfg.HistoryMode.UnmarshalText([]byte(value)); err != nil { + if err := cfg.HistoryMode.UnmarshalText([]byte(value)); err != nil { Fatalf("--%s: %v", ChainHistoryFlag.Name, err) } } diff --git a/internal/memlimit/probe.go b/internal/memlimit/probe.go new file mode 100644 index 0000000000..f9e1845da2 --- /dev/null +++ b/internal/memlimit/probe.go @@ -0,0 +1,54 @@ +// Copyright 2026 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +// Package memlimit detects the effective memory limit visible to the +// current process. +// +// On Linux, /proc/meminfo always reports the host's RAM, even inside +// a container, so it cannot be used as a budget on its own. Limit +// consults the kernel-enforced cgroup limit first (v2 memory.max or +// v1 memory.limit_in_bytes), walking parent cgroups if a leaf node +// reports no limit. If no container-style limit is in effect, or on +// platforms without cgroups, it falls back to total system memory. +package memlimit + +import ( + gopsutil "github.com/shirou/gopsutil/mem" +) + +// Source identifies which mechanism produced the limit value. +type Source string + +const ( + SourceCgroupV2 Source = "cgroup-v2" + SourceCgroupV1 Source = "cgroup-v1" + SourceSystem Source = "system" + SourceUnknown Source = "unknown" +) + +// Limit returns the memory limit visible to this process in bytes and +// the source that produced it. The returned value is the tightest +// budget detectable, falling back to system total memory if no +// container-style limit is in effect. +func Limit() (bytes uint64, source Source) { + if v, src, ok := platformLimit(); ok { + return v, src + } + if mem, err := gopsutil.VirtualMemory(); err == nil { + return mem.Total, SourceSystem + } + return 0, SourceUnknown +} diff --git a/internal/memlimit/probe_linux.go b/internal/memlimit/probe_linux.go new file mode 100644 index 0000000000..f3a7e01684 --- /dev/null +++ b/internal/memlimit/probe_linux.go @@ -0,0 +1,177 @@ +// Copyright 2026 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +//go:build linux + +package memlimit + +import ( + "os" + "path" + "strconv" + "strings" +) + +// cgroupV1UnlimitedThreshold separates real limits from the kernel's +// "no limit" sentinel. The cgroup v1 memory controller stores limits +// in pages and returns LONG_MAX/PAGE_SIZE*PAGE_SIZE for unlimited, +// so the exact sentinel depends on the kernel's page size (4 KiB on +// most architectures, 16 KiB and 64 KiB also seen on arm64 and +// ppc64le). Treating any value above 1<<62 as unlimited covers every +// page size while staying well above any plausible real limit. +const cgroupV1UnlimitedThreshold = uint64(1) << 62 + +// fileReader reads the contents of a path. Injected for testing; the +// production reader is os.ReadFile. +type fileReader func(path string) ([]byte, error) + +func platformLimit() (uint64, Source, bool) { + return detectLinuxLimit(os.ReadFile) +} + +func detectLinuxLimit(read fileReader) (uint64, Source, bool) { + if v, ok := cgroupV2Limit(read); ok { + return v, SourceCgroupV2, true + } + if v, ok := cgroupV1Limit(read); ok { + return v, SourceCgroupV1, true + } + return 0, "", false +} + +// cgroupV2Limit reads the cgroup v2 memory.max for the current process. +// +// Two paths are considered, in order: +// +// 1. /sys/fs/cgroup/memory.max: what a process running in its own +// cgroup namespace (Docker default since 20.10, all modern k8s) +// sees as its effective root. This catches the common case +// without parsing /proc/self/cgroup. +// +// 2. /sys/fs/cgroup/memory.max where comes from +// /proc/self/cgroup. This handles bare-metal Linux where the +// limit is set on a systemd slice or other ancestor cgroup. +// +// In both cases we walk up parent cgroups whenever a node reports +// "max", because the limit may be set on an ancestor. +func cgroupV2Limit(read fileReader) (uint64, bool) { + if v, ok := readCgroupV2At("/sys/fs/cgroup", "/", read); ok { + return v, true + } + procPath, ok := readProcSelfCgroupV2(read) + if !ok || procPath == "/" { + return 0, false + } + return readCgroupV2At("/sys/fs/cgroup", procPath, read) +} + +// readCgroupV2At reads memory.max under root+rel, walking up through +// parents until a numeric value is found or the path bottoms out at +// the cgroup root. Returns the first non-"max" value encountered. +func readCgroupV2At(root, rel string, read fileReader) (uint64, bool) { + // Detect that v2 is mounted at root by checking for any of the v2 + // hallmark files. cgroup.controllers exists only on v2. + if _, err := read(path.Join(root, "cgroup.controllers")); err != nil { + return 0, false + } + for { + raw, err := read(path.Join(root, rel, "memory.max")) + if err == nil { + s := strings.TrimSpace(string(raw)) + if s != "max" { + // A numeric zero is degenerate (the kernel would + // kill anything that allocates) but legal to write; + // treat it the same as v1 treats a zero leaf, walking + // up looking for a meaningful ancestor limit. + if n, err := strconv.ParseUint(s, 10, 64); err == nil && n != 0 { + return n, true + } + } + } + if rel == "/" || rel == "" { + return 0, false + } + rel = path.Dir(rel) + } +} + +// readProcSelfCgroupV2 returns the cgroup path for the current process +// from a v2-format /proc/self/cgroup line ("0::"). Returns ok=false +// for v1-only systems or parse failure. +func readProcSelfCgroupV2(read fileReader) (string, bool) { + raw, err := read("/proc/self/cgroup") + if err != nil { + return "", false + } + for line := range strings.SplitSeq(strings.TrimSpace(string(raw)), "\n") { + // v2 unified line: "0::" + if strings.HasPrefix(line, "0::") { + return strings.TrimPrefix(line, "0::"), true + } + } + return "", false +} + +// cgroupV1Limit reads memory.limit_in_bytes from the v1 memory +// controller for this process. Walks parent cgroups when a node +// reports the unlimited sentinel. +func cgroupV1Limit(read fileReader) (uint64, bool) { + rel, ok := readProcSelfCgroupV1Memory(read) + if !ok { + return 0, false + } + root := "/sys/fs/cgroup/memory" + // Sanity-check that v1 is mounted; if not, give up. + if _, err := read(path.Join(root, "memory.limit_in_bytes")); err != nil { + return 0, false + } + for { + raw, err := read(path.Join(root, rel, "memory.limit_in_bytes")) + if err == nil { + if n, err := strconv.ParseUint(strings.TrimSpace(string(raw)), 10, 64); err == nil { + if n != 0 && n < cgroupV1UnlimitedThreshold { + return n, true + } + } + } + if rel == "/" || rel == "" { + return 0, false + } + rel = path.Dir(rel) + } +} + +// readProcSelfCgroupV1Memory parses /proc/self/cgroup for the v1 memory +// controller line (":memory:" or ":...,memory,...:"). +func readProcSelfCgroupV1Memory(read fileReader) (string, bool) { + raw, err := read("/proc/self/cgroup") + if err != nil { + return "", false + } + for line := range strings.SplitSeq(strings.TrimSpace(string(raw)), "\n") { + // Format: "::" + parts := strings.SplitN(line, ":", 3) + if len(parts) != 3 { + continue + } + for ctrl := range strings.SplitSeq(parts[1], ",") { + if ctrl == "memory" { + return parts[2], true + } + } + } + return "", false +} diff --git a/internal/memlimit/probe_linux_test.go b/internal/memlimit/probe_linux_test.go new file mode 100644 index 0000000000..12ff427cd9 --- /dev/null +++ b/internal/memlimit/probe_linux_test.go @@ -0,0 +1,179 @@ +// Copyright 2026 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +//go:build linux + +package memlimit + +import ( + "os" + "testing" +) + +// fakeFS is a fileReader backed by an in-memory map. Missing keys +// return os.ErrNotExist so the production code paths see the same +// errors they would on a real filesystem. +type fakeFS map[string]string + +func (f fakeFS) read(path string) ([]byte, error) { + v, ok := f[path] + if !ok { + return nil, os.ErrNotExist + } + return []byte(v), nil +} + +func TestDetectLinuxLimitCgroupV2Container(t *testing.T) { + // Common modern Docker scenario: cgroup namespace makes the + // container see /sys/fs/cgroup as its own cgroup root, so + // memory.max sits directly there. /proc/self/cgroup says "/". + fs := fakeFS{ + "/sys/fs/cgroup/cgroup.controllers": "memory cpu io", + "/sys/fs/cgroup/memory.max": "536870912", + "/proc/self/cgroup": "0::/", + } + bytes, src, ok := detectLinuxLimit(fs.read) + if !ok || bytes != 536870912 || src != SourceCgroupV2 { + t.Errorf("got (%d, %s, %v), want (536870912, cgroup-v2, true)", bytes, src, ok) + } +} + +func TestDetectLinuxLimitCgroupV2Unlimited(t *testing.T) { + fs := fakeFS{ + "/sys/fs/cgroup/cgroup.controllers": "memory cpu io", + "/sys/fs/cgroup/memory.max": "max", + "/proc/self/cgroup": "0::/", + } + _, _, ok := detectLinuxLimit(fs.read) + if ok { + t.Errorf("expected ok=false for fully unlimited v2 hierarchy") + } +} + +func TestDetectLinuxLimitCgroupV2LimitOnAncestor(t *testing.T) { + // Bare-metal systemd: leaf cgroup has no limit but the + // containing slice does. The walk-up must find it. + fs := fakeFS{ + "/sys/fs/cgroup/cgroup.controllers": "memory cpu io", + "/sys/fs/cgroup/memory.max": "max", + "/sys/fs/cgroup/system.slice/memory.max": "8589934592", + "/sys/fs/cgroup/system.slice/geth.service/memory.max": "max", + "/proc/self/cgroup": "0::/system.slice/geth.service", + } + bytes, src, ok := detectLinuxLimit(fs.read) + if !ok || bytes != 8589934592 || src != SourceCgroupV2 { + t.Errorf("got (%d, %s, %v), want (8589934592, cgroup-v2, true)", bytes, src, ok) + } +} + +func TestDetectLinuxLimitCgroupV2PrefersDirectRoot(t *testing.T) { + // The direct probe at /sys/fs/cgroup/memory.max is consulted + // before any walk derived from /proc/self/cgroup. In a + // namespaced container that direct read is the right answer + // even if /proc/self/cgroup happens to show a host-side path + // whose ancestor cgroups have no limit. + fs := fakeFS{ + "/sys/fs/cgroup/cgroup.controllers": "memory cpu io", + "/sys/fs/cgroup/memory.max": "536870912", + "/sys/fs/cgroup/system.slice/memory.max": "max", + "/proc/self/cgroup": "0::/system.slice/docker-abc.scope", + } + bytes, _, ok := detectLinuxLimit(fs.read) + if !ok || bytes != 536870912 { + t.Errorf("got (%d, ok=%v), want (536870912, true)", bytes, ok) + } +} + +func TestDetectLinuxLimitCgroupV1(t *testing.T) { + fs := fakeFS{ + // no v2 hallmark file + "/sys/fs/cgroup/memory/memory.limit_in_bytes": "9223372036854771712", + "/sys/fs/cgroup/memory/docker/abc/memory.limit_in_bytes": "1073741824", + "/proc/self/cgroup": "12:memory:/docker/abc\n11:cpu:/docker/abc", + } + bytes, src, ok := detectLinuxLimit(fs.read) + if !ok || bytes != 1073741824 || src != SourceCgroupV1 { + t.Errorf("got (%d, %s, %v), want (1073741824, cgroup-v1, true)", bytes, src, ok) + } +} + +func TestDetectLinuxLimitCgroupV1Unlimited(t *testing.T) { + fs := fakeFS{ + "/sys/fs/cgroup/memory/memory.limit_in_bytes": "9223372036854771712", + "/sys/fs/cgroup/memory/docker/abc/memory.limit_in_bytes": "9223372036854771712", + "/proc/self/cgroup": "12:memory:/docker/abc", + } + _, _, ok := detectLinuxLimit(fs.read) + if ok { + t.Errorf("expected ok=false when v1 reports the unlimited sentinel everywhere") + } +} + +func TestDetectLinuxLimitCgroupV1NonDefaultPageSize(t *testing.T) { + // On 16 KiB- and 64 KiB-page kernels (some arm64 distros, ppc64le) + // the v1 unlimited sentinel is LONG_MAX page-aligned to the local + // page size, not the 4 KiB value 0x7FFFFFFFFFFFF000. Both must + // still be treated as unlimited. + fs := fakeFS{ + "/sys/fs/cgroup/memory/memory.limit_in_bytes": "9223372036854710272", // 64 KiB-page sentinel + "/sys/fs/cgroup/memory/foo/memory.limit_in_bytes": "9223372036854767616", // 16 KiB-page sentinel + "/proc/self/cgroup": "12:memory:/foo", + } + _, _, ok := detectLinuxLimit(fs.read) + if ok { + t.Errorf("expected ok=false: both values are page-aligned LONG_MAX for non-4KiB page sizes") + } +} + +func TestDetectLinuxLimitCgroupV2ZeroWalksUp(t *testing.T) { + // A leaf with memory.max="0" is degenerate (kernel kills any + // allocation) but legal. We should walk up the same way we do + // for "max" and pick up an ancestor's real limit. + fs := fakeFS{ + "/sys/fs/cgroup/cgroup.controllers": "memory cpu io", + "/sys/fs/cgroup/memory.max": "max", + "/sys/fs/cgroup/system.slice/memory.max": "536870912", + "/sys/fs/cgroup/system.slice/geth.service/memory.max": "0", + "/proc/self/cgroup": "0::/system.slice/geth.service", + } + bytes, _, ok := detectLinuxLimit(fs.read) + if !ok || bytes != 536870912 { + t.Errorf("got (%d, ok=%v), want (536870912, true)", bytes, ok) + } +} + +func TestDetectLinuxLimitCgroupV1CombinedControllers(t *testing.T) { + // Some kernels list multiple controllers per line. + fs := fakeFS{ + "/sys/fs/cgroup/memory/memory.limit_in_bytes": "9223372036854771712", + "/sys/fs/cgroup/memory/foo/memory.limit_in_bytes": "2147483648", + "/proc/self/cgroup": "8:cpu,memory,blkio:/foo", + } + bytes, _, ok := detectLinuxLimit(fs.read) + if !ok || bytes != 2147483648 { + t.Errorf("got (%d, ok=%v), want (2147483648, true)", bytes, ok) + } +} + +func TestDetectLinuxLimitNoCgroup(t *testing.T) { + fs := fakeFS{ + "/proc/self/cgroup": "0::/", + } + _, _, ok := detectLinuxLimit(fs.read) + if ok { + t.Errorf("expected ok=false when neither v1 nor v2 is mounted") + } +} diff --git a/internal/memlimit/probe_other.go b/internal/memlimit/probe_other.go new file mode 100644 index 0000000000..2f03cb6691 --- /dev/null +++ b/internal/memlimit/probe_other.go @@ -0,0 +1,28 @@ +// Copyright 2026 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +//go:build !linux + +package memlimit + +// platformLimit on non-Linux returns ok=false; the caller falls back +// to total system memory. macOS has no equivalent kernel feature, and +// Windows job objects are not yet probed here. Docker Desktop on macOS +// and Windows runs containers inside a Linux VM, where the cgroup path +// in probe_linux.go applies as usual. +func platformLimit() (uint64, Source, bool) { + return 0, "", false +} diff --git a/internal/memlimit/probe_test.go b/internal/memlimit/probe_test.go new file mode 100644 index 0000000000..ce6ac279fe --- /dev/null +++ b/internal/memlimit/probe_test.go @@ -0,0 +1,30 @@ +// Copyright 2026 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +package memlimit + +import "testing" + +// TestLimitSmoke asserts that Limit() returns a non-zero value on +// any host the test suite could plausibly run on. This catches +// regressions in the gopsutil fallback path that the OS-specific +// tests (which use a fake reader) would miss. +func TestLimitSmoke(t *testing.T) { + bytes, src := Limit() + if bytes == 0 { + t.Errorf("Limit() returned 0 bytes (source=%s); expected non-zero on any sane host", src) + } +}