internal/memlimit: respect cgroup memory cap (#34947)

Currently geth ignores the docker `--memory` directive and doesn't
adjust its cache size downward when necessary, potentially running into
OOM.

While gopsutil has functions like `docker/CgroupMem()`, they are meant for
reading the cgroup memory limit of a container from the host.
This commit is contained in:
Sina M 2026-06-12 13:31:15 +02:00 committed by GitHub
parent 906727089b
commit d93dda49c8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 452 additions and 11 deletions

View file

@ -58,6 +58,7 @@ import (
"github.com/ethereum/go-ethereum/graphql"
"github.com/ethereum/go-ethereum/internal/ethapi"
"github.com/ethereum/go-ethereum/internal/flags"
"github.com/ethereum/go-ethereum/internal/memlimit"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
"github.com/ethereum/go-ethereum/metrics/exp"
@ -74,7 +75,6 @@ import (
"github.com/ethereum/go-ethereum/triedb/hashdb"
"github.com/ethereum/go-ethereum/triedb/pathdb"
pcsclite "github.com/gballet/go-libpcsclite"
gopsutil "github.com/shirou/gopsutil/mem"
"github.com/urfave/cli/v2"
)
@ -1726,16 +1726,18 @@ func SetEthConfig(ctx *cli.Context, stack *node.Node, cfg *ethconfig.Config) {
setMiner(ctx, &cfg.Miner)
setRequiredBlocks(ctx, cfg)
// Cap the cache allowance and tune the garbage collector
mem, err := gopsutil.VirtualMemory()
if err == nil {
if 32<<(^uintptr(0)>>63) == 32 && mem.Total > 2*1024*1024*1024 {
log.Warn("Lowering memory allowance on 32bit arch", "available", mem.Total/1024/1024, "addressable", 2*1024)
mem.Total = 2 * 1024 * 1024 * 1024
// Cap the cache allowance and tune the garbage collector against
// the effective memory limit (cgroup-imposed when running in a
// container, total system memory otherwise).
total, source := memlimit.Limit()
if total > 0 {
if 32<<(^uintptr(0)>>63) == 32 && total > 2*1024*1024*1024 {
log.Warn("Lowering memory allowance on 32bit arch", "available", total/1024/1024, "addressable", 2*1024)
total = 2 * 1024 * 1024 * 1024
}
allowance := int(mem.Total / 1024 / 1024 / 3)
allowance := int(total / 1024 / 1024 / 3)
if cache := ctx.Int(CacheFlag.Name); cache > allowance {
log.Warn("Sanitizing cache to Go's GC limits", "provided", cache, "updated", allowance)
log.Warn("Sanitizing cache to Go's GC limits", "source", source, "provided", cache, "updated", allowance)
ctx.Set(CacheFlag.Name, strconv.Itoa(allowance))
}
}
@ -1750,14 +1752,14 @@ func SetEthConfig(ctx *cli.Context, stack *node.Node, cfg *ethconfig.Config) {
cfg.SyncMode = ethconfig.FullSync // dev sync target forces full sync
} else if ctx.IsSet(SyncModeFlag.Name) {
value := ctx.String(SyncModeFlag.Name)
if err = cfg.SyncMode.UnmarshalText([]byte(value)); err != nil {
if err := cfg.SyncMode.UnmarshalText([]byte(value)); err != nil {
Fatalf("--%v: %v", SyncModeFlag.Name, err)
}
}
if ctx.IsSet(ChainHistoryFlag.Name) {
value := ctx.String(ChainHistoryFlag.Name)
if err = cfg.HistoryMode.UnmarshalText([]byte(value)); err != nil {
if err := cfg.HistoryMode.UnmarshalText([]byte(value)); err != nil {
Fatalf("--%s: %v", ChainHistoryFlag.Name, err)
}
}

View file

@ -0,0 +1,59 @@
// Copyright 2026 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
// Package memlimit detects the effective memory limit of the current
// process. On Linux the cgroup limit is consulted first, with total
// system memory as the fallback on all platforms.
package memlimit
import (
gopsutil "github.com/shirou/gopsutil/mem"
)
// Source names the mechanism that supplied a memory limit value.
type Source int

// Recognised limit sources. SourceUnknown is the zero value of Source.
const (
	SourceUnknown Source = iota
	SourceCgroupV2
	SourceCgroupV1
	SourceSystem
)

// String returns a short lowercase label for s. Any value other than
// the cgroup and system constants is rendered as "unknown".
func (s Source) String() string {
	labels := [...]string{
		SourceCgroupV2: "cgroup-v2",
		SourceCgroupV1: "cgroup-v1",
		SourceSystem:   "system",
	}
	if s <= SourceUnknown || int(s) >= len(labels) {
		return "unknown"
	}
	return labels[s]
}
// Limit returns the memory limit visible to this process in bytes and
// the source that produced it.
//
// The platform-specific limit reported by platformLimit is preferred,
// but only while it does not exceed the total system memory: such a
// limit can be configured above the machine's RAM, in which case the
// system total is the real ceiling and is returned with SourceSystem.
// When the system total cannot be determined, the platform limit is
// returned as is. If neither probe yields a value the result is
// (0, SourceUnknown).
func Limit() (bytes uint64, source Source) {
	var (
		system     uint64
		haveSystem bool
	)
	if mem, err := gopsutil.VirtualMemory(); err == nil {
		system, haveSystem = mem.Total, true
	}
	if v, src, ok := platformLimit(); ok {
		// A zero system total is not a usable ceiling, so it never
		// overrides the platform limit.
		if !haveSystem || system == 0 || v <= system {
			return v, src
		}
	}
	if haveSystem {
		return system, SourceSystem
	}
	return 0, SourceUnknown
}

View file

@ -0,0 +1,155 @@
// Copyright 2026 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
//go:build linux
package memlimit
import (
"os"
"path"
"strconv"
"strings"
)
// cgroupV1UnlimitedThreshold is the cut-off used to recognise the cgroup
// v1 "no limit" sentinel. That sentinel is LONG_MAX rounded down to the
// kernel page size, so its exact value depends on the page size; any
// reading at or above 1<<62 is treated as unlimited.
const cgroupV1UnlimitedThreshold uint64 = 1 << 62
// fileReader abstracts os.ReadFile for testing: given a file path it
// returns the file's contents or an error. The cgroup probes take a
// fileReader instead of calling os.ReadFile directly so that tests can
// substitute an in-memory implementation.
type fileReader func(path string) ([]byte, error)
// platformLimit reports the cgroup memory limit of the current process,
// trying cgroup v2 (memory.max) before cgroup v1
// (memory.limit_in_bytes). Inside a container the cgroup limit is the
// authoritative budget, because /proc/meminfo reports the host's RAM.
// The boolean result is false when neither probe finds a limit.
func platformLimit() (uint64, Source, bool) {
	probes := [...]struct {
		query func(fileReader) (uint64, bool)
		src   Source
	}{
		{cgroupV2Limit, SourceCgroupV2},
		{cgroupV1Limit, SourceCgroupV1},
	}
	for _, p := range probes {
		if limit, ok := p.query(os.ReadFile); ok {
			return limit, p.src, true
		}
	}
	return 0, SourceUnknown, false
}
// cgroupV2Limit returns the cgroup v2 memory.max that applies to the
// current process.
//
// Two locations are tried in order:
//  1. /sys/fs/cgroup itself, which is the effective root inside a
//     cgroup-namespaced container.
//  2. The path listed in /proc/self/cgroup, for the bare-metal case
//     where the limit sits on a systemd slice.
//
// The second step is skipped when /proc/self/cgroup has no v2 entry or
// that entry is "/", since the root was already probed.
func cgroupV2Limit(read fileReader) (uint64, bool) {
	const mount = "/sys/fs/cgroup"

	limit, found := readCgroupV2At(mount, "/", read)
	if found {
		return limit, true
	}
	rel, ok := readProcSelfCgroupV2(read)
	if ok && rel != "/" {
		return readCgroupV2At(mount, rel, read)
	}
	return 0, false
}
// readCgroupV2At returns the effective cgroup v2 memory limit for the
// cgroup at rel beneath the mount point root.
//
// memory.max is read at rel and at every ancestor up to root, and the
// smallest numeric value wins: an ancestor's limit also constrains its
// descendants, so the nearest numeric value is not necessarily the
// effective one. Entries that are "max", zero, unreadable or malformed
// are skipped. The boolean result is false when root is not a cgroup v2
// mount or when no level carries a usable limit.
func readCgroupV2At(root, rel string, read fileReader) (uint64, bool) {
	// cgroup.controllers exists only on v2; if absent, v2 is not mounted here.
	if _, err := read(path.Join(root, "cgroup.controllers")); err != nil {
		return 0, false
	}
	// Anchor rel at "/" so that relative or ".."-laden input can neither
	// escape root nor stall the upward walk (path.Dir("foo") is "." and
	// path.Dir(".") is "." again, which would never reach "/").
	rel = path.Clean("/" + rel)

	var (
		limit uint64
		found bool
	)
	for {
		if raw, err := read(path.Join(root, rel, "memory.max")); err == nil {
			// "max" fails to parse and is skipped. Zero is legal to
			// write but degenerate, so it is skipped as well.
			n, err := strconv.ParseUint(strings.TrimSpace(string(raw)), 10, 64)
			if err == nil && n != 0 && (!found || n < limit) {
				limit, found = n, true
			}
		}
		if rel == "/" {
			return limit, found
		}
		rel = path.Dir(rel)
	}
}
// readProcSelfCgroupV2 extracts the cgroup path of the current process
// from the cgroup v2 entry of /proc/self/cgroup, i.e. the first line of
// the form "0::<path>". The boolean result is false when the file
// cannot be read or holds no such line.
func readProcSelfCgroupV2(read fileReader) (string, bool) {
	// Prefix that marks the v2 unified hierarchy line.
	const unifiedPrefix = "0::"

	data, err := read("/proc/self/cgroup")
	if err != nil {
		return "", false
	}
	remaining := strings.TrimSpace(string(data))
	for remaining != "" {
		var entry string
		entry, remaining, _ = strings.Cut(remaining, "\n")
		if rel, ok := strings.CutPrefix(entry, unifiedPrefix); ok {
			return rel, true
		}
	}
	return "", false
}
// cgroupV1Limit returns the effective cgroup v1 memory limit of the
// current process.
//
// The process's path inside the v1 memory controller is taken from
// /proc/self/cgroup. memory.limit_in_bytes is then read at that path
// and at every ancestor up to /sys/fs/cgroup/memory, and the smallest
// real limit wins: with hierarchical accounting an ancestor's limit
// also constrains its descendants, so the nearest limit is not
// necessarily the effective one. Entries that are zero, unreadable,
// malformed, or at or above the "unlimited" sentinel threshold are
// skipped. The boolean result is false when the process has no v1
// memory controller entry, the controller is not mounted, or no level
// carries a usable limit.
func cgroupV1Limit(read fileReader) (uint64, bool) {
	rel, ok := readProcSelfCgroupV1Memory(read)
	if !ok {
		return 0, false
	}
	const root = "/sys/fs/cgroup/memory"
	// The limit file at the controller root doubles as the "is v1
	// mounted here" check.
	if _, err := read(path.Join(root, "memory.limit_in_bytes")); err != nil {
		return 0, false
	}
	// Anchor rel at "/" so that relative or ".."-laden input can neither
	// escape root nor stall the upward walk (path.Dir("foo") is "." and
	// path.Dir(".") is "." again, which would never reach "/").
	rel = path.Clean("/" + rel)

	var (
		limit uint64
		found bool
	)
	for {
		if raw, err := read(path.Join(root, rel, "memory.limit_in_bytes")); err == nil {
			n, err := strconv.ParseUint(strings.TrimSpace(string(raw)), 10, 64)
			if err == nil && n != 0 && n < cgroupV1UnlimitedThreshold && (!found || n < limit) {
				limit, found = n, true
			}
		}
		if rel == "/" {
			return limit, found
		}
		rel = path.Dir(rel)
	}
}
// readProcSelfCgroupV1Memory returns the cgroup path recorded for the
// v1 memory controller in /proc/self/cgroup. It matches the first line
// shaped like "<id>:memory:<path>" or "<id>:...,memory,...:<path>". The
// boolean result is false when the file cannot be read or no line
// lists the memory controller.
func readProcSelfCgroupV1Memory(read fileReader) (string, bool) {
	data, err := read("/proc/self/cgroup")
	if err != nil {
		return "", false
	}
	remaining := strings.TrimSpace(string(data))
	for remaining != "" {
		var entry string
		entry, remaining, _ = strings.Cut(remaining, "\n")

		// Each entry reads "<hierarchy-id>:<comma-separated-controllers>:<path>";
		// anything with fewer than two colons is ignored. The path is
		// everything after the second colon and may itself contain colons.
		_, tail, ok := strings.Cut(entry, ":")
		if !ok {
			continue
		}
		controllers, rel, ok := strings.Cut(tail, ":")
		if !ok {
			continue
		}
		// Wrapping the list in commas turns "some element equals memory"
		// into a plain substring test.
		if strings.Contains(","+controllers+",", ",memory,") {
			return rel, true
		}
	}
	return "", false
}

View file

@ -0,0 +1,172 @@
// Copyright 2026 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
//go:build linux
package memlimit
import (
"os"
"testing"
)
// fakeFS is an in-memory stand-in for the filesystem: keys are file
// paths and values are file contents. Its read method satisfies
// fileReader.
type fakeFS map[string]string

// read returns the contents stored under path, or os.ErrNotExist when
// the map has no such key.
func (f fakeFS) read(path string) ([]byte, error) {
	if content, found := f[path]; found {
		return []byte(content), nil
	}
	return nil, os.ErrNotExist
}
// TestCgroupV2Container covers a cgroup-namespaced container (Docker
// default since 20.10), where memory.max sits directly at the cgroup
// root.
func TestCgroupV2Container(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/cgroup.controllers": "memory cpu io",
		"/sys/fs/cgroup/memory.max":         "536870912",
		"/proc/self/cgroup":                 "0::/",
	}
	limit, found := cgroupV2Limit(files.read)
	if found && limit == 536870912 {
		return
	}
	t.Errorf("got (%d, %v), want (536870912, true)", limit, found)
}
// TestCgroupV2Unlimited checks that a v2 hierarchy whose only
// memory.max is "max" yields no limit.
func TestCgroupV2Unlimited(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/cgroup.controllers": "memory cpu io",
		"/sys/fs/cgroup/memory.max":         "max",
		"/proc/self/cgroup":                 "0::/",
	}
	_, found := cgroupV2Limit(files.read)
	if found {
		t.Errorf("expected ok=false for fully unlimited v2 hierarchy")
	}
}
// TestCgroupV2LimitOnAncestor covers bare-metal systemd, where the leaf
// cgroup has no limit but the slice containing it does.
func TestCgroupV2LimitOnAncestor(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/cgroup.controllers":                   "memory cpu io",
		"/sys/fs/cgroup/memory.max":                           "max",
		"/sys/fs/cgroup/system.slice/memory.max":              "8589934592",
		"/sys/fs/cgroup/system.slice/geth.service/memory.max": "max",
		"/proc/self/cgroup":                                   "0::/system.slice/geth.service",
	}
	limit, found := cgroupV2Limit(files.read)
	if found && limit == 8589934592 {
		return
	}
	t.Errorf("got (%d, %v), want (8589934592, true)", limit, found)
}
// TestCgroupV2PrefersDirectRoot checks that a limit found at the cgroup
// root takes precedence over the path in /proc/self/cgroup, which in a
// namespaced container may be a host-side path.
func TestCgroupV2PrefersDirectRoot(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/cgroup.controllers":      "memory cpu io",
		"/sys/fs/cgroup/memory.max":              "536870912",
		"/sys/fs/cgroup/system.slice/memory.max": "max",
		"/proc/self/cgroup":                      "0::/system.slice/docker-abc.scope",
	}
	limit, found := cgroupV2Limit(files.read)
	if found && limit == 536870912 {
		return
	}
	t.Errorf("got (%d, ok=%v), want (536870912, true)", limit, found)
}
// TestCgroupV2ZeroWalksUp checks that memory.max="0", which is legal
// but degenerate, is skipped like "max" so that the parent's limit is
// reported instead.
func TestCgroupV2ZeroWalksUp(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/cgroup.controllers":                   "memory cpu io",
		"/sys/fs/cgroup/memory.max":                           "max",
		"/sys/fs/cgroup/system.slice/memory.max":              "536870912",
		"/sys/fs/cgroup/system.slice/geth.service/memory.max": "0",
		"/proc/self/cgroup":                                   "0::/system.slice/geth.service",
	}
	limit, found := cgroupV2Limit(files.read)
	if found && limit == 536870912 {
		return
	}
	t.Errorf("got (%d, ok=%v), want (536870912, true)", limit, found)
}
// TestCgroupV1 covers a v1-only host: the v2 probe must fail because
// the fixture has no cgroup.controllers file, and the v1 probe must
// return the limit set on the process's own cgroup.
func TestCgroupV1(t *testing.T) {
	files := fakeFS{
		// no v2 hallmark file
		"/sys/fs/cgroup/memory/memory.limit_in_bytes":            "9223372036854771712",
		"/sys/fs/cgroup/memory/docker/abc/memory.limit_in_bytes": "1073741824",
		"/proc/self/cgroup":                                      "12:memory:/docker/abc\n11:cpu:/docker/abc",
	}
	_, v2Found := cgroupV2Limit(files.read)
	if v2Found {
		t.Errorf("expected v2 probe to fail on a v1-only host")
	}
	limit, found := cgroupV1Limit(files.read)
	if found && limit == 1073741824 {
		return
	}
	t.Errorf("got (%d, %v), want (1073741824, true)", limit, found)
}
// TestCgroupV1Unlimited checks that no limit is reported when every
// level of the v1 hierarchy holds the "unlimited" sentinel.
func TestCgroupV1Unlimited(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/memory/memory.limit_in_bytes":            "9223372036854771712",
		"/sys/fs/cgroup/memory/docker/abc/memory.limit_in_bytes": "9223372036854771712",
		"/proc/self/cgroup":                                      "12:memory:/docker/abc",
	}
	_, found := cgroupV1Limit(files.read)
	if found {
		t.Errorf("expected ok=false when v1 reports the unlimited sentinel everywhere")
	}
}
// TestCgroupV1NonDefaultPageSize checks that the "unlimited" sentinel is
// recognised on kernels whose page size is not 4 KiB. The sentinel is
// LONG_MAX rounded down to the local page size, so it differs on 16 KiB
// and 64 KiB page kernels:
//
//	64 KiB pages: 2^63 - 65536 = 9223372036854710272
//	16 KiB pages: 2^63 - 16384 = 9223372036854759424
func TestCgroupV1NonDefaultPageSize(t *testing.T) {
	fs := fakeFS{
		"/sys/fs/cgroup/memory/memory.limit_in_bytes":     "9223372036854710272", // 64 KiB-page sentinel
		"/sys/fs/cgroup/memory/foo/memory.limit_in_bytes": "9223372036854759424", // 16 KiB-page sentinel
		"/proc/self/cgroup":                               "12:memory:/foo",
	}
	if _, ok := cgroupV1Limit(fs.read); ok {
		t.Errorf("expected ok=false: both values are page-aligned LONG_MAX")
	}
}
// TestCgroupV1CombinedControllers covers /proc/self/cgroup lines that
// list several controllers on one hierarchy, as some kernels do.
func TestCgroupV1CombinedControllers(t *testing.T) {
	files := fakeFS{
		"/sys/fs/cgroup/memory/memory.limit_in_bytes":     "9223372036854771712",
		"/sys/fs/cgroup/memory/foo/memory.limit_in_bytes": "2147483648",
		"/proc/self/cgroup":                               "8:cpu,memory,blkio:/foo",
	}
	limit, found := cgroupV1Limit(files.read)
	if found && limit == 2147483648 {
		return
	}
	t.Errorf("got (%d, ok=%v), want (2147483648, true)", limit, found)
}
// TestNoCgroup checks that both probes report no limit when nothing is
// present under /sys/fs/cgroup.
func TestNoCgroup(t *testing.T) {
	files := fakeFS{
		"/proc/self/cgroup": "0::/",
	}
	_, v2Found := cgroupV2Limit(files.read)
	if v2Found {
		t.Errorf("expected v2 ok=false when v2 is not mounted")
	}
	_, v1Found := cgroupV1Limit(files.read)
	if v1Found {
		t.Errorf("expected v1 ok=false when v1 is not mounted")
	}
}

View file

@ -0,0 +1,25 @@
// Copyright 2026 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
//go:build !linux
package memlimit
// platformLimit is the stub used on non-Linux systems, where no
// platform-specific memory limit is probed. It always returns
// (0, SourceUnknown, false), so the caller falls back to total system
// memory.
func platformLimit() (uint64, Source, bool) {
return 0, SourceUnknown, false
}

View file

@ -0,0 +1,28 @@
// Copyright 2026 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package memlimit
import "testing"
// TestLimitSmoke runs Limit against the real host, exercising the
// platform probe and the gopsutil fallback, and fails if the reported
// limit is zero.
func TestLimitSmoke(t *testing.T) {
	if limit, origin := Limit(); limit == 0 {
		t.Errorf("Limit() returned 0 bytes (source=%s); expected non-zero on any sane host", origin)
	}
}