From 6e62cc5aa85249454fa80fba01a996864eee2483 Mon Sep 17 00:00:00 2001 From: Jonny Rhea <5555162+jrhea@users.noreply.github.com> Date: Tue, 16 Jun 2026 07:47:05 -0500 Subject: [PATCH] core/vm: compute stack operations in place (#35156) The stack primitives pop by value: pop() returns the 32-byte value itself, so every popped operand is copied out of the stack arena before it is used. The result side was already in place: peek returns a pointer and binary ops write into the new stack top. This PR fixes the operand side: pointer-returning primitives (pop1, pop2, pop1Peek1, etc.), with the handlers rewritten to read operands directly from their arena slots. Every popped operand paid the copy, whatever the op went on to do with it, so this optimization covers the arithmetic and comparison ops as much as JUMP, MSTORE, SSTORE and RETURN. The copy is visible in the assembly. On arm64, master's opLt spends four instructions moving the popped value through the frame, and the comparison then reads it back from there: LDP (R5), (R6, R7) ; load words 0 and 1 of the popped value from the arena LDP 16(R5), (R5, R8) ; load words 2 and 3 STP (R6, R7), vm.~r0-64(SP) ; store words 0 and 1 into a frame slot STP (R5, R8), vm.~r0-48(SP) ; store words 2 and 3 With pop1Peek1 those four instructions are gone, the frame shrinks from locals=0x58 to locals=0x18, and the function from 336 to 288 bytes. The compiler cannot remove the copy itself: uint256.Int is a four-element array, and Go's SSA does not promote arrays longer than one element to registers, so a by-value pop pays this round trip no matter how far inlining gets, for LT exactly as for ADD. The CALL and CREATE families are deliberately not converted: a child frame reuses the same stack arena, so parent pointers into popped slots die when the child pushes. The rule is recorded on the primitives: pointers stay valid until the next push or any sub call. 
Converting the call family safely means materializing scalars before the child call, left for later work with a call-heavy benchmark to justify it. ### Benchmarks Measured with the benchmark suite from #35144 (the evm-bench contract workloads and the block import benchmark), which is not part of this PR's diff. Apple M4 Max, fixed iteration counts, n=10, all p=0.000. B/op and allocs/op are statistically identical on every benchmark: | benchmark | master | PR | vs master | |---|---|---|---| | Snailtracer | 60.0 ms | 54.1 ms | -9.8% | | TenThousandHashes | 13.2 ms | 12.2 ms | -7.8% | | ERC20Transfer | 11.7 ms | 11.0 ms | -5.5% | | ERC20Mint | 7.49 ms | 7.02 ms | -6.2% | | ERC20ApprovalTransfer | 8.92 ms | 8.44 ms | -5.4% | This PR is independent of #35144 but plays nicely with it: the generated dispatch there splices these handler bodies, so the in-place forms land in its fast path too, where they measure larger. ### Testing The rewritten handlers run on the interpreter's only execution path, so correctness rests on references outside the change: - **Consensus fixtures.** The full tests package passes: state tests, the execution-spec families, blockchain tests. - **Opcode testcases.** The JSON testcases compare individual opcode results against committed expected values. - **Tracer fixtures.** The tracetest reference files pin exact log and return data shapes, covering the rewritten LOG and RETURN paths. - **Cross-build differential.** A goevmlab campaign running this branch's evm against master's evm over generated state tests across four forks (Prague, Cancun, London, Osaka) with full trace comparison: 160,566 tests, zero divergences. 
--------- Co-authored-by: MariusVanDerWijden --- core/vm/eips.go | 16 ++--- core/vm/instructions.go | 136 ++++++++++++++++++---------------------- core/vm/stack.go | 56 +++++++++++++++++ 3 files changed, 120 insertions(+), 88 deletions(-) diff --git a/core/vm/eips.go b/core/vm/eips.go index ba7cbd7461..f8473e65e8 100644 --- a/core/vm/eips.go +++ b/core/vm/eips.go @@ -212,8 +212,7 @@ func opTstore(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { if evm.readOnly { return nil, ErrWriteProtection } - loc := scope.Stack.pop() - val := scope.Stack.pop() + loc, val := scope.Stack.pop2() evm.StateDB.SetTransientState(scope.Contract.Address(), loc.Bytes32(), val.Bytes32()) return nil, nil } @@ -263,11 +262,7 @@ func enable5656(jt *JumpTable) { // opMcopy implements the MCOPY opcode (https://eips.ethereum.org/EIPS/eip-5656) func opMcopy(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - var ( - dst = scope.Stack.pop() - src = scope.Stack.pop() - length = scope.Stack.pop() - ) + dst, src, length := scope.Stack.pop3() // These values are checked for overflow during memory expansion calculation // (the memorySize function on the opcode). 
scope.Memory.Copy(dst.Uint64(), src.Uint64(), length.Uint64()) @@ -364,11 +359,8 @@ func enable8024(jt *JumpTable) { func opExtCodeCopyEIP4762(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { var ( - stack = scope.Stack - a = stack.pop() - memOffset = stack.pop() - codeOffset = stack.pop() - length = stack.pop() + stack = scope.Stack + a, memOffset, codeOffset, length = stack.pop4() ) uint64CodeOffset, overflow := codeOffset.Uint64WithOverflow() if overflow { diff --git a/core/vm/instructions.go b/core/vm/instructions.go index 92c363a356..209457f670 100644 --- a/core/vm/instructions.go +++ b/core/vm/instructions.go @@ -27,56 +27,56 @@ import ( ) func opAdd(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Add(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Add(x, y) return nil, nil } func opSub(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Sub(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Sub(x, y) return nil, nil } func opMul(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Mul(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Mul(x, y) return nil, nil } func opDiv(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Div(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Div(x, y) return nil, nil } func opSdiv(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.SDiv(&x, y) + x, y := scope.Stack.pop1Peek1() + y.SDiv(x, y) return nil, nil } func opMod(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Mod(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Mod(x, y) return nil, nil } func opSmod(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - 
y.SMod(&x, y) + x, y := scope.Stack.pop1Peek1() + y.SMod(x, y) return nil, nil } func opExp(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - base, exponent := scope.Stack.pop(), scope.Stack.peek() - exponent.Exp(&base, exponent) + base, exponent := scope.Stack.pop1Peek1() + exponent.Exp(base, exponent) return nil, nil } func opSignExtend(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - back, num := scope.Stack.pop(), scope.Stack.peek() - num.ExtendSign(num, &back) + back, num := scope.Stack.pop1Peek1() + num.ExtendSign(num, back) return nil, nil } @@ -87,7 +87,7 @@ func opNot(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opLt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() + x, y := scope.Stack.pop1Peek1() if x.Lt(y) { y.SetOne() } else { @@ -97,7 +97,7 @@ func opLt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opGt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() + x, y := scope.Stack.pop1Peek1() if x.Gt(y) { y.SetOne() } else { @@ -107,7 +107,7 @@ func opGt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opSlt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() + x, y := scope.Stack.pop1Peek1() if x.Slt(y) { y.SetOne() } else { @@ -117,7 +117,7 @@ func opSlt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opSgt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() + x, y := scope.Stack.pop1Peek1() if x.Sgt(y) { y.SetOne() } else { @@ -127,7 +127,7 @@ func opSgt(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opEq(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() + x, y := scope.Stack.pop1Peek1() if x.Eq(y) { y.SetOne() } else { @@ -147,38 +147,38 
@@ func opIszero(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opAnd(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.And(&x, y) + x, y := scope.Stack.pop1Peek1() + y.And(x, y) return nil, nil } func opOr(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Or(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Or(x, y) return nil, nil } func opXor(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y := scope.Stack.pop(), scope.Stack.peek() - y.Xor(&x, y) + x, y := scope.Stack.pop1Peek1() + y.Xor(x, y) return nil, nil } func opByte(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - th, val := scope.Stack.pop(), scope.Stack.peek() - val.Byte(&th) + th, val := scope.Stack.pop1Peek1() + val.Byte(th) return nil, nil } func opAddmod(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y, z := scope.Stack.pop(), scope.Stack.pop(), scope.Stack.peek() - z.AddMod(&x, &y, z) + x, y, z := scope.Stack.pop2Peek1() + z.AddMod(x, y, z) return nil, nil } func opMulmod(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - x, y, z := scope.Stack.pop(), scope.Stack.pop(), scope.Stack.peek() - z.MulMod(&x, &y, z) + x, y, z := scope.Stack.pop2Peek1() + z.MulMod(x, y, z) return nil, nil } @@ -187,7 +187,7 @@ func opMulmod(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { // and pushes on the stack arg2 shifted to the left by arg1 number of bits. 
func opSHL(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { // Note, second operand is left in the stack; accumulate result into it, and no need to push it afterwards - shift, value := scope.Stack.pop(), scope.Stack.peek() + shift, value := scope.Stack.pop1Peek1() if shift.LtUint64(256) { value.Lsh(value, uint(shift.Uint64())) } else { @@ -201,7 +201,7 @@ func opSHL(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { // and pushes on the stack arg2 shifted to the right by arg1 number of bits with zero fill. func opSHR(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { // Note, second operand is left in the stack; accumulate result into it, and no need to push it afterwards - shift, value := scope.Stack.pop(), scope.Stack.peek() + shift, value := scope.Stack.pop1Peek1() if shift.LtUint64(256) { value.Rsh(value, uint(shift.Uint64())) } else { @@ -214,7 +214,7 @@ func opSHR(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { // The SAR instruction (arithmetic shift right) pops 2 values from the stack, first arg1 and then arg2, // and pushes on the stack arg2 shifted to the right by arg1 number of bits with sign extension. 
func opSAR(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - shift, value := scope.Stack.pop(), scope.Stack.peek() + shift, value := scope.Stack.pop1Peek1() if shift.GtUint64(256) { if value.Sign() >= 0 { value.Clear() @@ -230,7 +230,7 @@ func opSAR(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opKeccak256(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - offset, size := scope.Stack.pop(), scope.Stack.peek() + offset, size := scope.Stack.pop1Peek1() data := scope.Memory.GetPtr(offset.Uint64(), size.Uint64()) hash := crypto.Keccak256Hash(data) @@ -286,11 +286,7 @@ func opCallDataSize(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opCallDataCopy(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - var ( - memOffset = scope.Stack.pop() - dataOffset = scope.Stack.pop() - length = scope.Stack.pop() - ) + memOffset, dataOffset, length := scope.Stack.pop3() dataOffset64, overflow := dataOffset.Uint64WithOverflow() if overflow { dataOffset64 = math.MaxUint64 @@ -309,11 +305,7 @@ func opReturnDataSize(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) } func opReturnDataCopy(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - var ( - memOffset = scope.Stack.pop() - dataOffset = scope.Stack.pop() - length = scope.Stack.pop() - ) + memOffset, dataOffset, length := scope.Stack.pop3() offset64, overflow := dataOffset.Uint64WithOverflow() if overflow { @@ -321,7 +313,7 @@ func opReturnDataCopy(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) } // we can reuse dataOffset now (aliasing it for clarity) var end = dataOffset - end.Add(&dataOffset, &length) + end.Add(dataOffset, length) end64, overflow := end.Uint64WithOverflow() if overflow || uint64(len(evm.returnData)) < end64 { return nil, ErrReturnDataOutOfBounds @@ -342,11 +334,7 @@ func opCodeSize(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opCodeCopy(pc *uint64, evm *EVM, scope *ScopeContext) 
([]byte, error) { - var ( - memOffset = scope.Stack.pop() - codeOffset = scope.Stack.pop() - length = scope.Stack.pop() - ) + memOffset, codeOffset, length := scope.Stack.pop3() uint64CodeOffset, overflow := codeOffset.Uint64WithOverflow() if overflow { uint64CodeOffset = math.MaxUint64 @@ -359,11 +347,8 @@ func opCodeCopy(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { func opExtCodeCopy(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { var ( - stack = scope.Stack - a = stack.pop() - memOffset = stack.pop() - codeOffset = stack.pop() - length = stack.pop() + stack = scope.Stack + a, memOffset, codeOffset, length = stack.pop4() ) uint64CodeOffset, overflow := codeOffset.Uint64WithOverflow() if overflow { @@ -480,7 +465,7 @@ func opGasLimit(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opPop(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - scope.Stack.pop() + scope.Stack.drop() return nil, nil } @@ -492,13 +477,13 @@ func opMload(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opMstore(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - mStart, val := scope.Stack.pop(), scope.Stack.pop() - scope.Memory.Set32(mStart.Uint64(), &val) + mStart, val := scope.Stack.pop2() + scope.Memory.Set32(mStart.Uint64(), val) return nil, nil } func opMstore8(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - off, val := scope.Stack.pop(), scope.Stack.pop() + off, val := scope.Stack.pop2() scope.Memory.store[off.Uint64()] = byte(val.Uint64()) return nil, nil } @@ -515,8 +500,7 @@ func opSstore(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { if evm.readOnly { return nil, ErrWriteProtection } - loc := scope.Stack.pop() - val := scope.Stack.pop() + loc, val := scope.Stack.pop2() evm.StateDB.SetState(scope.Contract.Address(), loc.Bytes32(), val.Bytes32()) return nil, nil } @@ -525,8 +509,8 @@ func opJump(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { if 
evm.abort.Load() { return nil, errStopToken } - pos := scope.Stack.pop() - if !scope.Contract.validJumpdest(&pos) { + pos := scope.Stack.pop1() + if !scope.Contract.validJumpdest(pos) { return nil, ErrInvalidJump } *pc = pos.Uint64() - 1 // pc will be increased by the interpreter loop @@ -537,9 +521,9 @@ func opJumpi(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { if evm.abort.Load() { return nil, errStopToken } - pos, cond := scope.Stack.pop(), scope.Stack.pop() + pos, cond := scope.Stack.pop2() if !cond.IsZero() { - if !scope.Contract.validJumpdest(&pos) { + if !scope.Contract.validJumpdest(pos) { return nil, ErrInvalidJump } *pc = pos.Uint64() - 1 // pc will be increased by the interpreter loop @@ -864,14 +848,14 @@ func opStaticCall(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { } func opReturn(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - offset, size := scope.Stack.pop(), scope.Stack.pop() + offset, size := scope.Stack.pop2() ret := scope.Memory.GetCopy(offset.Uint64(), size.Uint64()) return ret, errStopToken } func opRevert(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { - offset, size := scope.Stack.pop(), scope.Stack.pop() + offset, size := scope.Stack.pop2() ret := scope.Memory.GetCopy(offset.Uint64(), size.Uint64()) evm.returnData = ret @@ -893,7 +877,7 @@ func opSelfdestruct(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, error) { var ( this = scope.Contract.Address() balance = evm.StateDB.GetBalance(this) - top = scope.Stack.pop() + top = scope.Stack.pop1() beneficiary = common.Address(top.Bytes20()) ) // The funds are burned immediately if the beneficiary is the caller itself, @@ -923,7 +907,7 @@ func opSelfdestruct6780(pc *uint64, evm *EVM, scope *ScopeContext) ([]byte, erro var ( this = scope.Contract.Address() balance = evm.StateDB.GetBalance(this) - top = scope.Stack.pop() + top = scope.Stack.pop1() beneficiary = common.Address(top.Bytes20()) newContract = evm.StateDB.IsNewContract(this) 
) @@ -1092,9 +1076,9 @@ func makeLog(size int) executionFunc { } topics := make([]common.Hash, size) stack := scope.Stack - mStart, mSize := stack.pop(), stack.pop() + mStart, mSize := stack.pop2() for i := 0; i < size; i++ { - addr := stack.pop() + addr := stack.pop1() topics[i] = addr.Bytes32() } diff --git a/core/vm/stack.go b/core/vm/stack.go index d8000bc86d..564345ccd8 100644 --- a/core/vm/stack.go +++ b/core/vm/stack.go @@ -121,6 +121,62 @@ func (s *Stack) len() int { return s.size } +// drop removes the top element without reading it. +func (s *Stack) drop() { + s.inner.top-- + s.size-- +} + +// pop1 removes the top element and returns a pointer to it. The pointer +// stays valid only until the next push or sub call. +func (s *Stack) pop1() *uint256.Int { + s.inner.top-- + s.size-- + return &s.inner.data[s.inner.top] +} + +// pop2 removes the top two elements and returns pointers to them. The +// pointers stay valid only until the next push or sub call. +func (s *Stack) pop2() (top, second *uint256.Int) { + s.inner.top -= 2 + s.size -= 2 + return &s.inner.data[s.inner.top+1], &s.inner.data[s.inner.top] +} + +// pop3 removes the top three elements and returns pointers to them. The +// pointers stay valid only until the next push or sub call. +func (s *Stack) pop3() (top, second, third *uint256.Int) { + s.inner.top -= 3 + s.size -= 3 + return &s.inner.data[s.inner.top+2], &s.inner.data[s.inner.top+1], &s.inner.data[s.inner.top] +} + +// pop4 removes the top four elements and returns pointers to them. The +// pointers stay valid only until the next push or sub call. +func (s *Stack) pop4() (top, second, third, fourth *uint256.Int) { + s.inner.top -= 4 + s.size -= 4 + return &s.inner.data[s.inner.top+3], &s.inner.data[s.inner.top+2], &s.inner.data[s.inner.top+1], &s.inner.data[s.inner.top] +} + +// pop1Peek1 removes the top element and returns pointers to it and to the new +// top, the usual operand and write target of a binary operation. 
The first +// pointer stays valid only until the next push or sub call. +func (s *Stack) pop1Peek1() (top, rest *uint256.Int) { + s.inner.top-- + s.size-- + return &s.inner.data[s.inner.top], &s.inner.data[s.inner.top-1] +} + +// pop2Peek1 removes the top two elements and returns pointers to them and to +// the new top, for three operand operations. The first two pointers stay +// valid only until the next push or sub call. +func (s *Stack) pop2Peek1() (top, second, rest *uint256.Int) { + s.inner.top -= 2 + s.size -= 2 + return &s.inner.data[s.inner.top+1], &s.inner.data[s.inner.top], &s.inner.data[s.inner.top-1] +} + func (s *Stack) swap1() { s.inner.data[s.bottom+s.size-2], s.inner.data[s.bottom+s.size-1] = s.inner.data[s.bottom+s.size-1], s.inner.data[s.bottom+s.size-2] }