Merge ee36b77785 into 12eabbd76d

2026-05-24 08:49:29 +00:00 · 2026-05-21 21:55:10 -07:00 · 2026-05-21 21:55:10 -07:00 · d8a8e7d0a1
commit d8a8e7d0a1
parent 12eabbd76d ee36b77785
22 changed files with 5762 additions and 6368 deletions
--- a/consensus/clique/clique.go
+++ b/consensus/clique/clique.go
@ -37,7 +37,6 @@ import (
 	"github.com/ethereum/go-ethereum/core/types/bal"
 	"github.com/ethereum/go-ethereum/core/vm"
 	"github.com/ethereum/go-ethereum/crypto"
-	"github.com/ethereum/go-ethereum/crypto/keccak"
 	"github.com/ethereum/go-ethereum/ethdb"
 	"github.com/ethereum/go-ethereum/log"
 	"github.com/ethereum/go-ethereum/params"
@ -627,9 +626,11 @@ func (c *Clique) Close() error {

 // SealHash returns the hash of a block prior to it being sealed.
 func SealHash(header *types.Header) (hash common.Hash) {
-	hasher := keccak.NewLegacyKeccak256()
+	// Borrow a hasher from the crypto package and hand it back on return.
+	hasher := crypto.NewKeccakState()
+	defer crypto.ReturnToPool(hasher)
+
 	encodeSigHeader(hasher, header)
-	hasher.(crypto.KeccakState).Read(hash[:])
+	// Sum appends the digest to hash[:0], i.e. it is stored in hash itself.
+	hasher.Sum(hash[:0])
 	return hash
 }

--- a/core/types/bloom9.go
+++ b/core/types/bloom9.go
@ -23,7 +23,7 @@ import (

 	"github.com/ethereum/go-ethereum/common/bitutil"
 	"github.com/ethereum/go-ethereum/common/hexutil"
-	"github.com/ethereum/go-ethereum/crypto"
+	"github.com/ethereum/go-ethereum/crypto/keccak"
 )

 type bytesBacked interface {
@ -141,7 +141,7 @@ func Bloom9(data []byte) []byte {

 // bloomValues returns the bytes (index-value pairs) to set for the given data
 func bloomValues(data []byte, hashbuf *[6]byte) (uint, byte, uint, byte, uint, byte) {
-	sha := hasherPool.Get().(crypto.KeccakState)
+	sha := hasherPool.Get().(keccak.KeccakState)
 	sha.Reset()
 	sha.Write(data)
 	sha.Read(hashbuf[:])
--- a/core/types/hashing.go
+++ b/core/types/hashing.go
@ -24,6 +24,7 @@ import (

 	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/crypto"
+	"github.com/ethereum/go-ethereum/crypto/keccak"
 	"github.com/ethereum/go-ethereum/rlp"
 )

@ -55,7 +56,7 @@ func getPooledBuffer(size uint64) ([]byte, *bytes.Buffer, error) {

 // rlpHash encodes x and hashes the encoded bytes.
 func rlpHash(x interface{}) (h common.Hash) {
-	sha := hasherPool.Get().(crypto.KeccakState)
+	sha := hasherPool.Get().(keccak.KeccakState)
 	defer hasherPool.Put(sha)
 	sha.Reset()
 	rlp.Encode(sha, x)
@ -66,7 +67,7 @@ func rlpHash(x interface{}) (h common.Hash) {
 // prefixedRlpHash writes the prefix into the hasher before rlp-encoding x.
 // It's used for typed transactions.
 func prefixedRlpHash(prefix byte, x interface{}) (h common.Hash) {
-	sha := hasherPool.Get().(crypto.KeccakState)
+	sha := hasherPool.Get().(keccak.KeccakState)
 	defer hasherPool.Put(sha)
 	sha.Reset()
 	sha.Write([]byte{prefix})
--- a/crypto/crypto.go
+++ b/crypto/crypto.go
@ -24,13 +24,13 @@ import (
 	"encoding/hex"
 	"errors"
 	"fmt"
-	"hash"
 	"io"
 	"math/big"
 	"os"

 	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/common/math"
+	"github.com/ethereum/go-ethereum/crypto/keccak"
 	"github.com/ethereum/go-ethereum/rlp"
 )

@ -59,16 +59,8 @@ type EllipticCurve interface {
 	Unmarshal(data []byte) (x, y *big.Int)
 }

-// KeccakState wraps sha3.state. In addition to the usual hash methods, it also supports
-// Read to get a variable amount of data from the hash state. Read is faster than Sum
-// because it doesn't copy the internal state, but also modifies the internal state.
-type KeccakState interface {
-	hash.Hash
-	Read([]byte) (int, error)
-}
-
 // HashData hashes the provided data using the KeccakState and returns a 32 byte hash
-func HashData(kh KeccakState, data []byte) (h common.Hash) {
+func HashData(kh keccak.KeccakState, data []byte) (h common.Hash) {
 	kh.Reset()
 	kh.Write(data)
 	kh.Read(h[:])
--- a/crypto/keccak.go
+++ b/crypto/keccak.go
@ -26,38 +26,40 @@ import (
 )

 // NewKeccakState creates a new KeccakState
+//
+// The hasher is taken from hasherPool and reset before it is returned, so it
+// may be a recycled instance. Pass it to ReturnToPool once it is no longer
+// needed to make it available for reuse.
-func NewKeccakState() KeccakState {
-	return keccak.NewLegacyKeccak256().(KeccakState)
+func NewKeccakState() keccak.KeccakState {
+	h := hasherPool.Get().(keccak.KeccakState)
+	h.Reset()
+	return h
 }

+// ReturnToPool puts h back into hasherPool so that a later NewKeccakState
+// call can reuse it. The caller must not use h after handing it back.
+func ReturnToPool(h keccak.KeccakState) { hasherPool.Put(h) }
+
+// hasherPool recycles Keccak-256 hashers between calls. Hashers are reset
+// when they are taken out (see NewKeccakState), not when they are put back.
 var hasherPool = sync.Pool{
 	New: func() any {
-		return keccak.NewLegacyKeccak256().(KeccakState)
+		return keccak.NewLegacyKeccak256()
 	},
 }

 // Keccak256 calculates and returns the Keccak256 hash of the input data.
 func Keccak256(data ...[]byte) []byte {
 	b := make([]byte, 32)
-	d := hasherPool.Get().(KeccakState)
-	d.Reset()
+	// NewKeccakState hands out a pooled hasher that has already been reset.
+	d := NewKeccakState()
+	// Note: the loop variable b shadows the result slice b inside the loop body.
 	for _, b := range data {
 		d.Write(b)
 	}
 	d.Read(b)
-	hasherPool.Put(d)
+	// The digest has been copied into b, so the hasher can be recycled.
+	ReturnToPool(d)
 	return b
 }

 // Keccak256Hash calculates and returns the Keccak256 hash of the input data,
 // converting it to an internal Hash data structure.
 func Keccak256Hash(data ...[]byte) (h common.Hash) {
-	d := hasherPool.Get().(KeccakState)
-	d.Reset()
+	// NewKeccakState hands out a pooled hasher that has already been reset.
+	d := NewKeccakState()
 	for _, b := range data {
 		d.Write(b)
 	}
-	d.Read(h[:])
-	hasherPool.Put(d)
+	// The error from Read is deliberately ignored (hence the nolint marker).
+	d.Read(h[:]) //nolint:errcheck
+	// The digest has been copied into h, so the hasher can be recycled.
+	ReturnToPool(d)
 	return h
 }
--- a/crypto/keccak/hashes.go
+++ b/crypto/keccak/hashes.go
@ -1,44 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package keccak
-
-// This file provides functions for creating instances of the SHA-3
-// and SHAKE hash functions, as well as utility functions for hashing
-// bytes.
-
-import (
-	"hash"
-)
-
-const (
-	dsbyteSHA3   = 0b00000110
-	dsbyteKeccak = 0b00000001
-	dsbyteShake  = 0b00011111
-	dsbyteCShake = 0b00000100
-
-	// rateK[c] is the rate in bytes for Keccak[c] where c is the capacity in
-	// bits. Given the sponge size is 1600 bits, the rate is 1600 - c bits.
-	rateK256  = (1600 - 256) / 8
-	rateK448  = (1600 - 448) / 8
-	rateK512  = (1600 - 512) / 8
-	rateK768  = (1600 - 768) / 8
-	rateK1024 = (1600 - 1024) / 8
-)
-
-// NewLegacyKeccak256 creates a new Keccak-256 hash.
-//
-// Only use this function if you require compatibility with an existing cryptosystem
-// that uses non-standard padding. All other users should use New256 instead.
-func NewLegacyKeccak256() hash.Hash {
-	return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteKeccak}
-}
-
-// NewLegacyKeccak512 creates a new Keccak-512 hash.
-//
-// Only use this function if you require compatibility with an existing cryptosystem
-// that uses non-standard padding. All other users should use New512 instead.
-func NewLegacyKeccak512() hash.Hash {
-	return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteKeccak}
-}
--- a/crypto/keccak/keccaf_arm64.s
+++ b/crypto/keccak/keccaf_arm64.s
@ -0,0 +1,226 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego
+
+#include "textflag.h"
+
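+// func keccakF1600Sha3(a *[200]byte, buf *byte)
+// Applies the 24-round Keccak-f[1600] permutation to the state at a, in place.
+// When buf != nil, first XORs the 136 bytes (17 lanes, the Keccak-256 rate) at buf into the state.
+// When buf == nil, just permutes.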
+TEXT ·keccakF1600Sha3(SB), $200-16
+	MOVD	a+0(FP), R0
+	MOVD	buf+8(FP), R3
+	MOVD	$round_consts<>(SB), R1
+	MOVD	$24, R2 // counter for loop
+
+	CBZ	R3, load_state
+
+	// XOR path: load state and XOR with buf (17 lanes = 136 bytes)
+	VLD1.P	16(R0), [V0.D1, V1.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V0.B16, V0.B16
+	VEOR	V26.B16, V1.B16, V1.B16
+
+	VLD1.P	16(R0), [V2.D1, V3.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V2.B16, V2.B16
+	VEOR	V26.B16, V3.B16, V3.B16
+
+	VLD1.P	16(R0), [V4.D1, V5.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V4.B16, V4.B16
+	VEOR	V26.B16, V5.B16, V5.B16
+
+	VLD1.P	16(R0), [V6.D1, V7.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V6.B16, V6.B16
+	VEOR	V26.B16, V7.B16, V7.B16
+
+	VLD1.P	16(R0), [V8.D1, V9.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V8.B16, V8.B16
+	VEOR	V26.B16, V9.B16, V9.B16
+
+	VLD1.P	16(R0), [V10.D1, V11.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V10.B16, V10.B16
+	VEOR	V26.B16, V11.B16, V11.B16
+
+	VLD1.P	16(R0), [V12.D1, V13.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V12.B16, V12.B16
+	VEOR	V26.B16, V13.B16, V13.B16
+
+	VLD1.P	16(R0), [V14.D1, V15.D1]
+	VLD1.P	16(R3), [V25.D1, V26.D1]
+	VEOR	V25.B16, V14.B16, V14.B16
+	VEOR	V26.B16, V15.B16, V15.B16
+
+	// Lane 16: last data lane (8 bytes at buf offset 128)
+	VLD1.P	16(R0), [V16.D1, V17.D1]
+	VLD1	(R3), [V25.D1]
+	VEOR	V25.B16, V16.B16, V16.B16
+
+	// Remaining state lanes 18-24 (no data to XOR)
+	VLD1.P	16(R0), [V18.D1, V19.D1]
+	VLD1.P	16(R0), [V20.D1, V21.D1]
+	VLD1.P	16(R0), [V22.D1, V23.D1]
+	VLD1	(R0), [V24.D1]
+
+	SUB	$192, R0, R0
+	B	rounds
+
+load_state:
+	VLD1.P	16(R0), [V0.D1, V1.D1]
+	VLD1.P	16(R0), [V2.D1, V3.D1]
+	VLD1.P	16(R0), [V4.D1, V5.D1]
+	VLD1.P	16(R0), [V6.D1, V7.D1]
+	VLD1.P	16(R0), [V8.D1, V9.D1]
+	VLD1.P	16(R0), [V10.D1, V11.D1]
+	VLD1.P	16(R0), [V12.D1, V13.D1]
+	VLD1.P	16(R0), [V14.D1, V15.D1]
+	VLD1.P	16(R0), [V16.D1, V17.D1]
+	VLD1.P	16(R0), [V18.D1, V19.D1]
+	VLD1.P	16(R0), [V20.D1, V21.D1]
+	VLD1.P	16(R0), [V22.D1, V23.D1]
+	VLD1	(R0), [V24.D1]
+
+	SUB	$192, R0, R0
+
+rounds:
+	// theta
+	VEOR3	 V20.B16, V15.B16, V10.B16, V25.B16
+	VEOR3	 V21.B16, V16.B16, V11.B16, V26.B16
+	VEOR3	 V22.B16, V17.B16, V12.B16, V27.B16
+	VEOR3	 V23.B16, V18.B16, V13.B16, V28.B16
+	VEOR3	 V24.B16, V19.B16, V14.B16, V29.B16
+	VEOR3	 V25.B16, V5.B16, V0.B16, V25.B16
+	VEOR3	 V26.B16, V6.B16, V1.B16, V26.B16
+	VEOR3	 V27.B16, V7.B16, V2.B16, V27.B16
+	VEOR3	 V28.B16, V8.B16, V3.B16, V28.B16
+	VEOR3	 V29.B16, V9.B16, V4.B16, V29.B16
+
+	VRAX1	V27.D2, V25.D2, V30.D2
+	VRAX1	V28.D2, V26.D2, V31.D2
+	VRAX1	V29.D2, V27.D2, V27.D2
+	VRAX1	V25.D2, V28.D2, V28.D2
+	VRAX1	V26.D2, V29.D2, V29.D2
+
+	// theta and rho and Pi
+	VEOR	V29.B16, V0.B16, V0.B16
+
+	VXAR	$63, V30.D2, V1.D2, V25.D2
+
+	VXAR	$20, V30.D2, V6.D2, V1.D2
+	VXAR	$44, V28.D2, V9.D2, V6.D2
+	VXAR	$3, V31.D2, V22.D2, V9.D2
+	VXAR	$25, V28.D2, V14.D2, V22.D2
+	VXAR	$46, V29.D2, V20.D2, V14.D2
+
+	VXAR	$2, V31.D2, V2.D2, V26.D2
+
+	VXAR	$21, V31.D2, V12.D2, V2.D2
+	VXAR	$39, V27.D2, V13.D2, V12.D2
+	VXAR	$56, V28.D2, V19.D2, V13.D2
+	VXAR	$8, V27.D2, V23.D2, V19.D2
+	VXAR	$23, V29.D2, V15.D2, V23.D2
+
+	VXAR	$37, V28.D2, V4.D2, V15.D2
+
+	VXAR	$50, V28.D2, V24.D2, V28.D2
+	VXAR	$62, V30.D2, V21.D2, V24.D2
+	VXAR	$9, V27.D2, V8.D2, V8.D2
+	VXAR	$19, V30.D2, V16.D2, V4.D2
+	VXAR	$28, V29.D2, V5.D2, V16.D2
+
+	VXAR	$36, V27.D2, V3.D2, V5.D2
+
+	VXAR	$43, V27.D2, V18.D2, V27.D2
+	VXAR	$49, V31.D2, V17.D2, V3.D2
+	VXAR	$54, V30.D2, V11.D2, V30.D2
+	VXAR	$58, V31.D2, V7.D2, V31.D2
+	VXAR	$61, V29.D2, V10.D2, V29.D2
+
+	// chi and iota
+	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
+	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
+	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
+	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
+	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16
+
+	VLD1R.P	8(R1), [V26.D2]
+
+	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
+	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
+	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
+	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
+	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16
+
+	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
+	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
+	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
+	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
+	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16
+
+	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
+	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
+	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
+	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
+	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16
+
+	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
+	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
+
+	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16  // iota (chi part)
+
+	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
+	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16
+
+	VEOR	V26.B16, V0.B16, V0.B16 // iota
+
+	SUB		$1, R2, R2
+	CBNZ	R2, rounds
+
+	VST1.P	[V0.D1, V1.D1], 16(R0)
+	VST1.P	[V2.D1, V3.D1], 16(R0)
+	VST1.P	[V4.D1, V5.D1], 16(R0)
+	VST1.P	[V6.D1, V7.D1], 16(R0)
+	VST1.P	[V8.D1, V9.D1], 16(R0)
+	VST1.P	[V10.D1, V11.D1], 16(R0)
+	VST1.P	[V12.D1, V13.D1], 16(R0)
+	VST1.P	[V14.D1, V15.D1], 16(R0)
+	VST1.P	[V16.D1, V17.D1], 16(R0)
+	VST1.P	[V18.D1, V19.D1], 16(R0)
+	VST1.P	[V20.D1, V21.D1], 16(R0)
+	VST1.P	[V22.D1, V23.D1], 16(R0)
+	VST1	[V24.D1], (R0)
+
+	RET
+
+DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001
+DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082
+DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a
+DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000
+DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b
+DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001
+DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081
+DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009
+DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a
+DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088
+DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009
+DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a
+DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b
+DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b
+DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089
+DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003
+DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002
+DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080
+DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a
+DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a
+DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081
+DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080
+DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001
+DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008
+GLOBL	round_consts<>(SB), NOPTR|RODATA, $192
--- a/crypto/keccak/keccak.go
+++ b/crypto/keccak/keccak.go
@ -0,0 +1,20 @@
+// Package keccak provides Keccak-256 hashing with platform-specific acceleration.
+package keccak
+
+import "hash"
+
+// KeccakState wraps the keccak hasher. In addition to the usual hash methods, it also supports
+// Read to get a variable amount of data from the hash state. Read is faster than Sum
+// because it doesn't copy the internal state, but also modifies the internal state.
+type KeccakState interface {
+	hash.Hash
+	Read([]byte) (int, error)
+}
+
+const rate = 136 // sponge rate for Keccak-256: (1600 - 2*256) / 8
+
+// Compile-time check that *Hasher implements KeccakState.
+var _ KeccakState = (*Hasher)(nil)
+
+// NewLegacyKeccak256 returns a new streaming Keccak-256 hasher.
+// The result is simply a zero-valued Hasher, which is ready for use.
+func NewLegacyKeccak256() *Hasher {
+	return &Hasher{}
+}
--- a/crypto/keccak/keccak_arm64.go
+++ b/crypto/keccak/keccak_arm64.go
@ -0,0 +1,30 @@
+//go:build arm64 && !purego
+
+package keccak
+
+import (
+	"runtime"
+
+	"golang.org/x/sys/cpu"
+)
+
+// Apple Silicon always has Armv8.2-A SHA3 extensions (VEOR3, VRAX1, VXAR, VBCAX).
+// On other ARM64 platforms, detect at runtime via CPU feature flags.
+// When SHA3 is unavailable, falls back to x/crypto/sha3.
+func init() {
+	useASM = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3
+}
+
+// keccakF1600Sha3 permutes state. When buf != nil, it first XORs rate bytes
+// of buf into state, saving one full memory pass.
+//
+//go:noescape
+func keccakF1600Sha3(a *[200]byte, buf *byte)
+
+// keccakF1600 permutes the state in place using the SHA3-extension assembly
+// routine, without absorbing any input (buf is passed as nil).
+// NOTE(review): this calls the assembly unconditionally; callers appear to be
+// responsible for checking useASM first.
+func keccakF1600(a *[200]byte) {
+	keccakF1600Sha3(a, nil)
+}
+
+// xorAndPermute XORs one rate-sized block starting at buf into state and then
+// permutes state, all inside the assembly routine. buf must point to at least
+// rate (136) readable bytes; a nil buf would skip the XOR and only permute.
+func xorAndPermute(state *[200]byte, buf *byte) {
+	keccakF1600Sha3(state, buf)
+}
--- a/crypto/keccak/keccak_asm.go
+++ b/crypto/keccak/keccak_asm.go
@ -0,0 +1,233 @@
+//go:build (amd64 || arm64) && !purego
+
+package keccak
+
+import (
+	"encoding/binary"
+
+	"golang.org/x/crypto/sha3"
+)
+
+// useASM is set by platform-specific init to indicate hardware acceleration is available.
+// When false, Sum256 and Hasher fall back to x/crypto/sha3.
+var useASM bool
+
+// sponge is the core Keccak-256 sponge state used by native (asm) implementations.
+type sponge struct {
+	state     [200]byte  // 1600-bit permutation state
+	buf       [rate]byte // input bytes that do not yet make up a full block
+	absorbed  int        // number of valid bytes currently held in buf
+	squeezing bool       // set by the first Read; Write and Sum256 panic afterwards
+	readIdx   int        // index into state for next Read byte
+}
+
+// Reset resets the sponge to its initial state.
+// buf is not cleared: with absorbed set to 0 none of its bytes are considered valid.
+func (s *sponge) Reset() {
+	s.state = [200]byte{}
+	s.absorbed = 0
+	s.squeezing = false
+	s.readIdx = 0
+}
+
+// Write absorbs data into the sponge.
+// Panics if called after Read.
+// It always returns len(p) and a nil error.
+func (s *sponge) Write(p []byte) (int, error) {
+	if s.squeezing {
+		panic("keccak: Write after Read")
+	}
+	n := len(p)
+	// Top up a partially filled buffer first; absorb it once it holds a full block.
+	if s.absorbed > 0 {
+		x := copy(s.buf[s.absorbed:rate], p)
+		s.absorbed += x
+		p = p[x:]
+		if s.absorbed == rate {
+			xorAndPermute(&s.state, &s.buf[0])
+			s.absorbed = 0
+		}
+	}
+
+	// Absorb whole blocks straight from p without copying them into buf.
+	for len(p) >= rate {
+		xorAndPermute(&s.state, &p[0])
+		p = p[rate:]
+	}
+
+	// Keep the remaining tail (shorter than one block) for a later call.
+	if len(p) > 0 {
+		s.absorbed = copy(s.buf[:], p)
+	}
+	return n, nil
+}
+
+// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
+// Does not modify the sponge state.
+// Panics if called after Read.
+func (s *sponge) Sum256() [32]byte {
+	if s.squeezing {
+		panic("keccak: Sum after Read")
+	}
+	// Finalize a copy so the receiver can keep absorbing afterwards.
+	state := s.state
+	xorIn(&state, s.buf[:s.absorbed])
+	// Padding: 0x01 directly after the buffered input, 0x80 in the last byte of the rate block.
+	state[s.absorbed] ^= 0x01
+	state[rate-1] ^= 0x80
+	keccakF1600(&state)
+	return [32]byte(state[:32])
+}
+
+// Sum appends the current Keccak-256 digest to b and returns the resulting slice.
+// Does not modify the sponge state.
+// Like Sum256, which it calls, it panics if called after Read.
+func (s *sponge) Sum(b []byte) []byte {
+	d := s.Sum256()
+	return append(b, d[:]...)
+}
+
+// Size returns the number of bytes Sum will produce (32).
+func (s *sponge) Size() int { return 32 }
+
+// BlockSize returns the sponge rate in bytes (136).
+func (s *sponge) BlockSize() int { return rate }
+
+// Read squeezes an arbitrary number of bytes from the sponge.
+// On the first call, it pads and permutes, transitioning from absorbing to squeezing.
+// Subsequent calls to Write will panic. It never returns an error.
+// It always returns len(out).
+func (s *sponge) Read(out []byte) (int, error) {
+	if !s.squeezing {
+		s.padAndSqueeze()
+	}
+
+	n := len(out)
+	for len(out) > 0 {
+		// Only the first rate bytes of the state are handed out per permutation.
+		x := copy(out, s.state[s.readIdx:rate])
+		s.readIdx += x
+		out = out[x:]
+		// Block exhausted: permute to produce the next rate bytes of output.
+		if s.readIdx == rate {
+			keccakF1600(&s.state)
+			s.readIdx = 0
+		}
+	}
+	return n, nil
+}
+
+// padAndSqueeze absorbs the buffered tail, applies the padding, permutes the
+// state in place and switches the sponge into squeezing mode. Unlike Sum256 it
+// modifies the receiver's state directly.
+func (s *sponge) padAndSqueeze() {
+	xorIn(&s.state, s.buf[:s.absorbed])
+	// Padding: 0x01 directly after the buffered input, 0x80 in the last byte of the rate block.
+	s.state[s.absorbed] ^= 0x01
+	s.state[rate-1] ^= 0x80
+	keccakF1600(&s.state)
+	s.squeezing = true
+	s.readIdx = 0
+}
+
+// sum256Sponge computes Keccak-256 in one shot using the assembly permutation.
+// The state lives in a local array; data is read in place and never copied
+// into an intermediate buffer.
+func sum256Sponge(data []byte) [32]byte {
+	var state [200]byte
+
+	// Absorb all complete rate-sized blocks.
+	for len(data) >= rate {
+		xorAndPermute(&state, &data[0])
+		data = data[rate:]
+	}
+
+	// Absorb the remaining tail (fewer than rate bytes) and pad:
+	// 0x01 directly after the input, 0x80 in the last byte of the rate block.
+	xorIn(&state, data)
+	state[len(data)] ^= 0x01
+	state[rate-1] ^= 0x80
+	keccakF1600(&state)
+
+	return [32]byte(state[:32])
+}
+
+// Sum256 computes the Keccak-256 hash of data. Zero heap allocations when hardware
+// acceleration is available.
+// When useASM is false it falls back to sum256XCrypto (x/crypto/sha3).
+func Sum256(data []byte) [32]byte {
+	if !useASM {
+		return sum256XCrypto(data)
+	}
+	return sum256Sponge(data)
+}
+
+// sum256XCrypto computes Keccak-256 of data with a freshly created
+// x/crypto/sha3 legacy Keccak-256 hasher. It is the fallback used by Sum256
+// when useASM is false.
+func sum256XCrypto(data []byte) [32]byte {
+	h := sha3.NewLegacyKeccak256()
+	h.Write(data)
+	var out [32]byte
+	// Sum appends to out[:0], so the digest is stored in out itself.
+	h.Sum(out[:0])
+	return out
+}
+
+// Hasher is a streaming Keccak-256 hasher.
+// Uses platform assembly when available, x/crypto/sha3 otherwise.
+// The zero value is ready to use. Size and BlockSize are promoted from the
+// embedded sponge.
+type Hasher struct {
+	sponge             // state used when useASM is true
+	xc     KeccakState // x/crypto fallback, created lazily when useASM is false
+}
+
+// Reset resets the hasher to its initial state.
+func (h *Hasher) Reset() {
+	if useASM {
+		h.sponge.Reset()
+	} else {
+		// A freshly created fallback hasher is already in its initial state,
+		// so Reset is only called on an existing one.
+		if h.xc == nil {
+			h.xc = sha3.NewLegacyKeccak256().(KeccakState)
+		} else {
+			h.xc.Reset()
+		}
+	}
+}
+
+// Write absorbs data into the hasher.
+// Panics if called after Read.
+func (h *Hasher) Write(p []byte) (int, error) {
+	if !useASM {
+		// Create the x/crypto fallback on first use so the zero Hasher works.
+		if h.xc == nil {
+			h.xc = sha3.NewLegacyKeccak256().(KeccakState)
+		}
+		return h.xc.Write(p)
+	}
+	return h.sponge.Write(p)
+}
+
+// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
+// Does not modify the hasher state.
+func (h *Hasher) Sum256() [32]byte {
+	if !useASM {
+		// No fallback hasher yet means nothing was written: return the
+		// digest of empty input without creating one.
+		if h.xc == nil {
+			return Sum256(nil)
+		}
+		var out [32]byte
+		h.xc.Sum(out[:0])
+		return out
+	}
+	return h.sponge.Sum256()
+}
+
+// Sum appends the current Keccak-256 digest to b and returns the resulting slice.
+// Does not modify the hasher state.
+func (h *Hasher) Sum(b []byte) []byte {
+	if !useASM {
+		// No fallback hasher yet means nothing was written: append the
+		// digest of empty input.
+		if h.xc == nil {
+			d := Sum256(nil)
+			return append(b, d[:]...)
+		}
+		return h.xc.Sum(b)
+	}
+	return h.sponge.Sum(b)
+}
+
+// Read squeezes an arbitrary number of bytes from the sponge.
+// On the first call, it pads and permutes, transitioning from absorbing to squeezing.
+// Subsequent calls to Write will panic. It never returns an error.
+func (h *Hasher) Read(out []byte) (int, error) {
+	if !useASM {
+		// Create the x/crypto fallback on first use so the zero Hasher works.
+		if h.xc == nil {
+			h.xc = sha3.NewLegacyKeccak256().(KeccakState)
+		}
+		return h.xc.Read(out)
+	}
+	return h.sponge.Read(out)
+}
+
+// xorIn XORs data into the first len(data) bytes of state using uint64 loads.
+// data must not be longer than state (200 bytes); the callers in this file
+// always pass fewer than rate bytes.
+func xorIn(state *[200]byte, data []byte) {
+	// Process eight bytes at a time while a full word remains.
+	for i := 0; i+8 <= len(data); i += 8 {
+		v := binary.LittleEndian.Uint64(state[i:]) ^ binary.LittleEndian.Uint64(data[i:])
+		binary.LittleEndian.PutUint64(state[i:], v)
+	}
+	// len(data) &^ 7 is len(data) rounded down to a multiple of 8, i.e. the
+	// start of the tail that the word loop did not cover.
+	for i := len(data) &^ 7; i < len(data); i++ {
+		state[i] ^= data[i]
+	}
+}
--- a/crypto/keccak/keccak_default.go
+++ b/crypto/keccak/keccak_default.go
@ -0,0 +1,71 @@
+//go:build (!arm64 && !amd64) || purego
+
+package keccak
+
+import (
+	"golang.org/x/crypto/sha3"
+)
+
+// Sum256 computes the Keccak-256 hash of data.
+// This file is built on platforms other than arm64 and amd64, or when the
+// purego build tag is set; there Sum256 delegates to
+// x/crypto/sha3.NewLegacyKeccak256().
+func Sum256(data []byte) [32]byte {
+	h := sha3.NewLegacyKeccak256()
+	h.Write(data)
+	var out [32]byte
+	// Sum appends to out[:0], so the digest is stored in out itself.
+	h.Sum(out[:0])
+	return out
+}
+
+// Hasher is a streaming Keccak-256 hasher wrapping x/crypto/sha3.
+// The zero value is ready to use: every method creates the wrapped hasher on
+// demand via init.
+type Hasher struct {
+	h KeccakState // wrapped x/crypto hasher; nil until first use
+}
+
+// init lazily creates the wrapped x/crypto hasher. It is called at the start
+// of every method that touches h.h and does nothing once the hasher exists.
+func (h *Hasher) init() {
+	if h.h == nil {
+		h.h = sha3.NewLegacyKeccak256().(KeccakState)
+	}
+}
+
+// Reset resets the hasher to its initial state.
+func (h *Hasher) Reset() {
+	h.init()
+	h.h.Reset()
+}
+
+// Write absorbs data into the hasher.
+// Panics if called after Read.
+func (h *Hasher) Write(p []byte) (int, error) {
+	h.init()
+	return h.h.Write(p)
+}
+
+// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
+// Does not modify the hasher state.
+func (h *Hasher) Sum256() [32]byte {
+	h.init()
+	var out [32]byte
+	h.h.Sum(out[:0])
+	return out
+}
+
+// Sum appends the current Keccak-256 digest to b and returns the resulting slice.
+// Does not modify the hasher state.
+func (h *Hasher) Sum(b []byte) []byte {
+	h.init()
+	return h.h.Sum(b)
+}
+
+// Size returns the number of bytes Sum will produce (32).
+func (h *Hasher) Size() int { return 32 }
+
+// BlockSize returns the sponge rate in bytes (136).
+func (h *Hasher) BlockSize() int { return rate }
+
+// Read squeezes an arbitrary number of bytes from the sponge.
+// On the first call, it pads and permutes, transitioning from absorbing to squeezing.
+// Subsequent calls to Write will panic. It never returns an error.
+func (h *Hasher) Read(out []byte) (int, error) {
+	h.init()
+	return h.h.Read(out)
+}
--- a/crypto/keccak/keccak_test.go
+++ b/crypto/keccak/keccak_test.go
@ -0,0 +1,353 @@
+package keccak
+
+import (
+	"bytes"
+	"encoding/hex"
+	"fmt"
+	"testing"
+
+	"golang.org/x/crypto/sha3"
+)
+
+func TestSum256Empty(t *testing.T) {
+	got := Sum256(nil)
+	// Known Keccak-256 of empty string.
+	want, _ := hex.DecodeString("c5d2460186f7233c927e7db2dcc703c0e500b653ca82273b7bfad8045d85a470")
+	if !bytes.Equal(got[:], want) {
+		t.Fatalf("Sum256(nil) = %x, want %x", got, want)
+	}
+}
+
+func TestSum256Hello(t *testing.T) {
+	got := Sum256([]byte("hello"))
+	want, _ := hex.DecodeString("1c8aff950685c2ed4bc3174f3472287b56d9517b9c948127319a09a7a36deac8")
+	if !bytes.Equal(got[:], want) {
+		t.Fatalf("Sum256(hello) = %x, want %x", got, want)
+	}
+}
+
+func TestSum256LargeData(t *testing.T) {
+	// Test with data larger than one block (rate=136 bytes).
+	data := make([]byte, 500)
+	for i := range data {
+		data[i] = byte(i)
+	}
+	got := Sum256(data)
+	// Verify against streaming Hasher.
+	var h Hasher
+	h.Write(data)
+	want := h.Sum256()
+	if got != want {
+		t.Fatalf("Sum256 vs Hasher mismatch: %x vs %x", got, want)
+	}
+}
+
+func TestHasherStreaming(t *testing.T) {
+	data := []byte("hello world, this is a longer test string for streaming keccak")
+	// All at once.
+	want := Sum256(data)
+	// Byte by byte.
+	var h Hasher
+	for _, b := range data {
+		h.Write([]byte{b})
+	}
+	got := h.Sum256()
+	if got != want {
+		t.Fatalf("streaming byte-by-byte: %x vs %x", got, want)
+	}
+}
+
+func TestHasherMultiBlock(t *testing.T) {
+	// Test with exactly 2 blocks + partial.
+	data := make([]byte, rate*2+50)
+	for i := range data {
+		data[i] = byte(i * 7)
+	}
+	want := Sum256(data)
+	// Write in chunks of 37 (not aligned to rate).
+	var h Hasher
+	for i := 0; i < len(data); i += 37 {
+		end := i + 37
+		if end > len(data) {
+			end = len(data)
+		}
+		h.Write(data[i:end])
+	}
+	got := h.Sum256()
+	if got != want {
+		t.Fatalf("multi-block streaming: %x vs %x", got, want)
+	}
+}
+
+func TestReadMatchesSum256(t *testing.T) {
+	// Read of 32 bytes should produce the same result as Sum256.
+	data := []byte("hello")
+	var h Hasher
+	h.Write(data)
+	var got [32]byte
+	h.Read(got[:])
+	want := Sum256(data)
+	if got != want {
+		t.Fatalf("Read(32) = %x, want %x", got, want)
+	}
+}
+
+func TestReadMatchesXCrypto(t *testing.T) {
+	// Compare Read output against x/crypto/sha3 for various lengths.
+	for _, readLen := range []int{32, 64, 136, 200, 500} {
+		data := []byte("test data for read comparison")
+		ref := sha3.NewLegacyKeccak256()
+		ref.Write(data)
+		want := make([]byte, readLen)
+		ref.(KeccakState).Read(want)
+
+		var h Hasher
+		h.Write(data)
+		got := make([]byte, readLen)
+		h.Read(got)
+		if !bytes.Equal(got, want) {
+			t.Fatalf("Read(%d) mismatch:\ngot:  %x\nwant: %x", readLen, got, want)
+		}
+	}
+}
+
+func TestReadMultipleCalls(t *testing.T) {
+	// Multiple Read calls should produce the same output as one large Read.
+	data := []byte("streaming read test")
+
+	// One large read.
+	var h1 Hasher
+	h1.Write(data)
+	all := make([]byte, 300)
+	h1.Read(all)
+
+	// Multiple small reads.
+	var h2 Hasher
+	h2.Write(data)
+	var parts []byte
+	for i := 0; i < 300; {
+		chunk := 37
+		if i+chunk > 300 {
+			chunk = 300 - i
+		}
+		buf := make([]byte, chunk)
+		h2.Read(buf)
+		parts = append(parts, buf...)
+		i += chunk
+	}
+	if !bytes.Equal(all, parts) {
+		t.Fatalf("multi-read mismatch:\ngot:  %x\nwant: %x", parts, all)
+	}
+}
+
+func TestReadEmpty(t *testing.T) {
+	// Read from hasher with no data written.
+	ref := sha3.NewLegacyKeccak256()
+	want := make([]byte, 32)
+	ref.(KeccakState).Read(want)
+
+	var h Hasher
+	got := make([]byte, 32)
+	h.Read(got)
+	if !bytes.Equal(got, want) {
+		t.Fatalf("Read empty mismatch:\ngot:  %x\nwant: %x", got, want)
+	}
+}
+
+func TestReadAfterReset(t *testing.T) {
+	var h Hasher
+	h.Write([]byte("first"))
+	h.Read(make([]byte, 32))
+
+	// Reset should allow Write again.
+	h.Reset()
+	h.Write([]byte("second"))
+	got := make([]byte, 32)
+	h.Read(got)
+
+	want := Sum256([]byte("second"))
+	if !bytes.Equal(got, want[:]) {
+		t.Fatalf("Read after Reset mismatch:\ngot:  %x\nwant: %x", got, want)
+	}
+}
+
+func TestWriteAfterReadPanics(t *testing.T) {
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatal("expected panic on Write after Read")
+		}
+	}()
+	var h Hasher
+	h.Write([]byte("data"))
+	h.Read(make([]byte, 32))
+	h.Write([]byte("more")) // should panic
+}
+
+func FuzzSum256(f *testing.F) {
+	f.Add([]byte(nil))
+	f.Add([]byte("hello"))
+	f.Add([]byte("hello world, this is a longer test string for streaming keccak"))
+	f.Add(make([]byte, rate))
+	f.Add(make([]byte, rate+1))
+	f.Add(make([]byte, rate*3+50))
+
+	f.Fuzz(func(t *testing.T, data []byte) {
+		// Reference: x/crypto NewLegacyKeccak256.
+		ref := sha3.NewLegacyKeccak256()
+		ref.Write(data)
+		want := ref.Sum(nil)
+
+		// Test Sum256.
+		got := Sum256(data)
+		if !bytes.Equal(got[:], want) {
+			t.Fatalf("Sum256 mismatch for len=%d\ngot:  %x\nwant: %x", len(data), got, want)
+		}
+
+		// Test streaming Hasher (write all at once).
+		var h Hasher
+		h.Write(data)
+		gotH := h.Sum256()
+		if !bytes.Equal(gotH[:], want) {
+			t.Fatalf("Hasher mismatch for len=%d\ngot:  %x\nwant: %x", len(data), gotH, want)
+		}
+
+		// Test streaming Hasher (byte-by-byte).
+		h.Reset()
+		for _, b := range data {
+			h.Write([]byte{b})
+		}
+		gotS := h.Sum256()
+		if !bytes.Equal(gotS[:], want) {
+			t.Fatalf("Hasher byte-by-byte mismatch for len=%d\ngot:  %x\nwant: %x", len(data), gotS, want)
+		}
+
+		// Test Read (32 bytes) matches Sum256.
+		h.Reset()
+		h.Write(data)
+		gotRead := make([]byte, 32)
+		h.Read(gotRead)
+		if !bytes.Equal(gotRead, want) {
+			t.Fatalf("Read(32) mismatch for len=%d\ngot:  %x\nwant: %x", len(data), gotRead, want)
+		}
+
+		// Test Read (extended output) matches x/crypto.
+		ref.Reset()
+		ref.Write(data)
+		wantExt := make([]byte, 200)
+		ref.(KeccakState).Read(wantExt)
+
+		h.Reset()
+		h.Write(data)
+		gotExt := make([]byte, 200)
+		h.Read(gotExt)
+		if !bytes.Equal(gotExt, wantExt) {
+			t.Fatalf("Read(200) mismatch for len=%d\ngot:  %x\nwant: %x", len(data), gotExt, wantExt)
+		}
+	})
+}
+
+// Comparison benchmarks: faster_keccak vs golang.org/x/crypto/sha3.
+var benchSizes = []int{32, 128, 256, 1024, 4096, 500 * 1024}
+
+func benchName(size int) string {
+	if size >= 1024 {
+		return fmt.Sprintf("%dK", size/1024)
+	}
+	return fmt.Sprintf("%dB", size)
+}
+
+// BenchmarkKeccak256Sum tests Sum256 with local faster_keccak implementation.
+func BenchmarkKeccak256Sum(b *testing.B) {
+	for _, size := range benchSizes {
+		data := make([]byte, size)
+		for i := range data {
+			data[i] = byte(i)
+		}
+		b.Run("FasterKeccak/"+benchName(size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			b.ReportAllocs()
+			for b.Loop() {
+				Sum256(data)
+			}
+		})
+	}
+}
+
+// BenchmarkKeccak256Stdlib benchmarks golang.org/x/crypto/sha3 (Reset+Write+Sum) as the reference for BenchmarkKeccak256Sum.
+func BenchmarkKeccak256Stdlib(b *testing.B) {
+	for _, size := range benchSizes {
+		data := make([]byte, size)
+		for i := range data {
+			data[i] = byte(i)
+		}
+		b.Run("StdLib/"+benchName(size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			b.ReportAllocs()
+			h := sha3.NewLegacyKeccak256()
+			for b.Loop() {
+				h.Reset()
+				h.Write(data)
+				h.Sum(nil)
+			}
+		})
+	}
+}
+
+// BenchmarkKeccak256Hasher tests Hasher.Sum256() with local faster_keccak implementation.
+func BenchmarkKeccak256Hasher(b *testing.B) {
+	for _, size := range benchSizes {
+		data := make([]byte, size)
+		for i := range data {
+			data[i] = byte(i)
+		}
+		b.Run("FasterKeccak/"+benchName(size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			b.ReportAllocs()
+			var h Hasher
+			for b.Loop() {
+				h.Reset()
+				h.Write(data)
+				h.Sum256()
+			}
+		})
+	}
+}
+
+// BenchmarkKeccak256HasherStdlib benchmarks the streaming API of golang.org/x/crypto/sha3 (Reset+Write+Read) as the reference for BenchmarkKeccak256Hasher.
+func BenchmarkKeccak256HasherStdlib(b *testing.B) {
+	for _, size := range benchSizes {
+		data := make([]byte, size)
+		for i := range data {
+			data[i] = byte(i)
+		}
+		b.Run("StdLib/"+benchName(size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			b.ReportAllocs()
+			h := sha3.NewLegacyKeccak256().(KeccakState)
+			var buf [32]byte
+			for b.Loop() {
+				h.Reset()
+				h.Write(data)
+				h.Read(buf[:])
+			}
+		})
+	}
+}
+
+// BenchmarkKeccakStreaming benchmarks the streaming hasher (Reset+Write+Read).
+// Use with benchstat: go test -bench=BenchmarkKeccakStreaming -benchmem ./... | benchstat
+func BenchmarkKeccakStreaming(b *testing.B) {
+	data := make([]byte, 32)
+	for i := range data {
+		data[i] = byte(i)
+	}
+	var h Hasher
+	var buf [32]byte
+	b.SetBytes(int64(len(data)))
+	b.ReportAllocs()
+	for b.Loop() {
+		h.Reset()
+		h.Write(data)
+		h.Read(buf[:])
+	}
+}
--- a/crypto/keccak/keccakf.go
+++ b/crypto/keccak/keccakf.go
@ -1,414 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !amd64 || purego || !gc
-
-package keccak
-
-import "math/bits"
-
-// rc stores the round constants for use in the ι step.
-var rc = [24]uint64{
-	0x0000000000000001,
-	0x0000000000008082,
-	0x800000000000808A,
-	0x8000000080008000,
-	0x000000000000808B,
-	0x0000000080000001,
-	0x8000000080008081,
-	0x8000000000008009,
-	0x000000000000008A,
-	0x0000000000000088,
-	0x0000000080008009,
-	0x000000008000000A,
-	0x000000008000808B,
-	0x800000000000008B,
-	0x8000000000008089,
-	0x8000000000008003,
-	0x8000000000008002,
-	0x8000000000000080,
-	0x000000000000800A,
-	0x800000008000000A,
-	0x8000000080008081,
-	0x8000000000008080,
-	0x0000000080000001,
-	0x8000000080008008,
-}
-
-// keccakF1600 applies the Keccak permutation to a 1600b-wide
-// state represented as a slice of 25 uint64s.
-func keccakF1600(a *[25]uint64) {
-	// Implementation translated from Keccak-inplace.c
-	// in the keccak reference code.
-	var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64
-
-	for i := 0; i < 24; i += 4 {
-		// Combines the 5 steps in each round into 2 steps.
-		// Unrolls 4 rounds per loop and spreads some steps across rounds.
-
-		// Round 1
-		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
-		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
-		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
-		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
-		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
-		d0 = bc4 ^ (bc1<<1 | bc1>>63)
-		d1 = bc0 ^ (bc2<<1 | bc2>>63)
-		d2 = bc1 ^ (bc3<<1 | bc3>>63)
-		d3 = bc2 ^ (bc4<<1 | bc4>>63)
-		d4 = bc3 ^ (bc0<<1 | bc0>>63)
-
-		bc0 = a[0] ^ d0
-		t = a[6] ^ d1
-		bc1 = bits.RotateLeft64(t, 44)
-		t = a[12] ^ d2
-		bc2 = bits.RotateLeft64(t, 43)
-		t = a[18] ^ d3
-		bc3 = bits.RotateLeft64(t, 21)
-		t = a[24] ^ d4
-		bc4 = bits.RotateLeft64(t, 14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i]
-		a[6] = bc1 ^ (bc3 &^ bc2)
-		a[12] = bc2 ^ (bc4 &^ bc3)
-		a[18] = bc3 ^ (bc0 &^ bc4)
-		a[24] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[10] ^ d0
-		bc2 = bits.RotateLeft64(t, 3)
-		t = a[16] ^ d1
-		bc3 = bits.RotateLeft64(t, 45)
-		t = a[22] ^ d2
-		bc4 = bits.RotateLeft64(t, 61)
-		t = a[3] ^ d3
-		bc0 = bits.RotateLeft64(t, 28)
-		t = a[9] ^ d4
-		bc1 = bits.RotateLeft64(t, 20)
-		a[10] = bc0 ^ (bc2 &^ bc1)
-		a[16] = bc1 ^ (bc3 &^ bc2)
-		a[22] = bc2 ^ (bc4 &^ bc3)
-		a[3] = bc3 ^ (bc0 &^ bc4)
-		a[9] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[20] ^ d0
-		bc4 = bits.RotateLeft64(t, 18)
-		t = a[1] ^ d1
-		bc0 = bits.RotateLeft64(t, 1)
-		t = a[7] ^ d2
-		bc1 = bits.RotateLeft64(t, 6)
-		t = a[13] ^ d3
-		bc2 = bits.RotateLeft64(t, 25)
-		t = a[19] ^ d4
-		bc3 = bits.RotateLeft64(t, 8)
-		a[20] = bc0 ^ (bc2 &^ bc1)
-		a[1] = bc1 ^ (bc3 &^ bc2)
-		a[7] = bc2 ^ (bc4 &^ bc3)
-		a[13] = bc3 ^ (bc0 &^ bc4)
-		a[19] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[5] ^ d0
-		bc1 = bits.RotateLeft64(t, 36)
-		t = a[11] ^ d1
-		bc2 = bits.RotateLeft64(t, 10)
-		t = a[17] ^ d2
-		bc3 = bits.RotateLeft64(t, 15)
-		t = a[23] ^ d3
-		bc4 = bits.RotateLeft64(t, 56)
-		t = a[4] ^ d4
-		bc0 = bits.RotateLeft64(t, 27)
-		a[5] = bc0 ^ (bc2 &^ bc1)
-		a[11] = bc1 ^ (bc3 &^ bc2)
-		a[17] = bc2 ^ (bc4 &^ bc3)
-		a[23] = bc3 ^ (bc0 &^ bc4)
-		a[4] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[15] ^ d0
-		bc3 = bits.RotateLeft64(t, 41)
-		t = a[21] ^ d1
-		bc4 = bits.RotateLeft64(t, 2)
-		t = a[2] ^ d2
-		bc0 = bits.RotateLeft64(t, 62)
-		t = a[8] ^ d3
-		bc1 = bits.RotateLeft64(t, 55)
-		t = a[14] ^ d4
-		bc2 = bits.RotateLeft64(t, 39)
-		a[15] = bc0 ^ (bc2 &^ bc1)
-		a[21] = bc1 ^ (bc3 &^ bc2)
-		a[2] = bc2 ^ (bc4 &^ bc3)
-		a[8] = bc3 ^ (bc0 &^ bc4)
-		a[14] = bc4 ^ (bc1 &^ bc0)
-
-		// Round 2
-		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
-		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
-		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
-		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
-		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
-		d0 = bc4 ^ (bc1<<1 | bc1>>63)
-		d1 = bc0 ^ (bc2<<1 | bc2>>63)
-		d2 = bc1 ^ (bc3<<1 | bc3>>63)
-		d3 = bc2 ^ (bc4<<1 | bc4>>63)
-		d4 = bc3 ^ (bc0<<1 | bc0>>63)
-
-		bc0 = a[0] ^ d0
-		t = a[16] ^ d1
-		bc1 = bits.RotateLeft64(t, 44)
-		t = a[7] ^ d2
-		bc2 = bits.RotateLeft64(t, 43)
-		t = a[23] ^ d3
-		bc3 = bits.RotateLeft64(t, 21)
-		t = a[14] ^ d4
-		bc4 = bits.RotateLeft64(t, 14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1]
-		a[16] = bc1 ^ (bc3 &^ bc2)
-		a[7] = bc2 ^ (bc4 &^ bc3)
-		a[23] = bc3 ^ (bc0 &^ bc4)
-		a[14] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[20] ^ d0
-		bc2 = bits.RotateLeft64(t, 3)
-		t = a[11] ^ d1
-		bc3 = bits.RotateLeft64(t, 45)
-		t = a[2] ^ d2
-		bc4 = bits.RotateLeft64(t, 61)
-		t = a[18] ^ d3
-		bc0 = bits.RotateLeft64(t, 28)
-		t = a[9] ^ d4
-		bc1 = bits.RotateLeft64(t, 20)
-		a[20] = bc0 ^ (bc2 &^ bc1)
-		a[11] = bc1 ^ (bc3 &^ bc2)
-		a[2] = bc2 ^ (bc4 &^ bc3)
-		a[18] = bc3 ^ (bc0 &^ bc4)
-		a[9] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[15] ^ d0
-		bc4 = bits.RotateLeft64(t, 18)
-		t = a[6] ^ d1
-		bc0 = bits.RotateLeft64(t, 1)
-		t = a[22] ^ d2
-		bc1 = bits.RotateLeft64(t, 6)
-		t = a[13] ^ d3
-		bc2 = bits.RotateLeft64(t, 25)
-		t = a[4] ^ d4
-		bc3 = bits.RotateLeft64(t, 8)
-		a[15] = bc0 ^ (bc2 &^ bc1)
-		a[6] = bc1 ^ (bc3 &^ bc2)
-		a[22] = bc2 ^ (bc4 &^ bc3)
-		a[13] = bc3 ^ (bc0 &^ bc4)
-		a[4] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[10] ^ d0
-		bc1 = bits.RotateLeft64(t, 36)
-		t = a[1] ^ d1
-		bc2 = bits.RotateLeft64(t, 10)
-		t = a[17] ^ d2
-		bc3 = bits.RotateLeft64(t, 15)
-		t = a[8] ^ d3
-		bc4 = bits.RotateLeft64(t, 56)
-		t = a[24] ^ d4
-		bc0 = bits.RotateLeft64(t, 27)
-		a[10] = bc0 ^ (bc2 &^ bc1)
-		a[1] = bc1 ^ (bc3 &^ bc2)
-		a[17] = bc2 ^ (bc4 &^ bc3)
-		a[8] = bc3 ^ (bc0 &^ bc4)
-		a[24] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[5] ^ d0
-		bc3 = bits.RotateLeft64(t, 41)
-		t = a[21] ^ d1
-		bc4 = bits.RotateLeft64(t, 2)
-		t = a[12] ^ d2
-		bc0 = bits.RotateLeft64(t, 62)
-		t = a[3] ^ d3
-		bc1 = bits.RotateLeft64(t, 55)
-		t = a[19] ^ d4
-		bc2 = bits.RotateLeft64(t, 39)
-		a[5] = bc0 ^ (bc2 &^ bc1)
-		a[21] = bc1 ^ (bc3 &^ bc2)
-		a[12] = bc2 ^ (bc4 &^ bc3)
-		a[3] = bc3 ^ (bc0 &^ bc4)
-		a[19] = bc4 ^ (bc1 &^ bc0)
-
-		// Round 3
-		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
-		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
-		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
-		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
-		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
-		d0 = bc4 ^ (bc1<<1 | bc1>>63)
-		d1 = bc0 ^ (bc2<<1 | bc2>>63)
-		d2 = bc1 ^ (bc3<<1 | bc3>>63)
-		d3 = bc2 ^ (bc4<<1 | bc4>>63)
-		d4 = bc3 ^ (bc0<<1 | bc0>>63)
-
-		bc0 = a[0] ^ d0
-		t = a[11] ^ d1
-		bc1 = bits.RotateLeft64(t, 44)
-		t = a[22] ^ d2
-		bc2 = bits.RotateLeft64(t, 43)
-		t = a[8] ^ d3
-		bc3 = bits.RotateLeft64(t, 21)
-		t = a[19] ^ d4
-		bc4 = bits.RotateLeft64(t, 14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2]
-		a[11] = bc1 ^ (bc3 &^ bc2)
-		a[22] = bc2 ^ (bc4 &^ bc3)
-		a[8] = bc3 ^ (bc0 &^ bc4)
-		a[19] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[15] ^ d0
-		bc2 = bits.RotateLeft64(t, 3)
-		t = a[1] ^ d1
-		bc3 = bits.RotateLeft64(t, 45)
-		t = a[12] ^ d2
-		bc4 = bits.RotateLeft64(t, 61)
-		t = a[23] ^ d3
-		bc0 = bits.RotateLeft64(t, 28)
-		t = a[9] ^ d4
-		bc1 = bits.RotateLeft64(t, 20)
-		a[15] = bc0 ^ (bc2 &^ bc1)
-		a[1] = bc1 ^ (bc3 &^ bc2)
-		a[12] = bc2 ^ (bc4 &^ bc3)
-		a[23] = bc3 ^ (bc0 &^ bc4)
-		a[9] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[5] ^ d0
-		bc4 = bits.RotateLeft64(t, 18)
-		t = a[16] ^ d1
-		bc0 = bits.RotateLeft64(t, 1)
-		t = a[2] ^ d2
-		bc1 = bits.RotateLeft64(t, 6)
-		t = a[13] ^ d3
-		bc2 = bits.RotateLeft64(t, 25)
-		t = a[24] ^ d4
-		bc3 = bits.RotateLeft64(t, 8)
-		a[5] = bc0 ^ (bc2 &^ bc1)
-		a[16] = bc1 ^ (bc3 &^ bc2)
-		a[2] = bc2 ^ (bc4 &^ bc3)
-		a[13] = bc3 ^ (bc0 &^ bc4)
-		a[24] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[20] ^ d0
-		bc1 = bits.RotateLeft64(t, 36)
-		t = a[6] ^ d1
-		bc2 = bits.RotateLeft64(t, 10)
-		t = a[17] ^ d2
-		bc3 = bits.RotateLeft64(t, 15)
-		t = a[3] ^ d3
-		bc4 = bits.RotateLeft64(t, 56)
-		t = a[14] ^ d4
-		bc0 = bits.RotateLeft64(t, 27)
-		a[20] = bc0 ^ (bc2 &^ bc1)
-		a[6] = bc1 ^ (bc3 &^ bc2)
-		a[17] = bc2 ^ (bc4 &^ bc3)
-		a[3] = bc3 ^ (bc0 &^ bc4)
-		a[14] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[10] ^ d0
-		bc3 = bits.RotateLeft64(t, 41)
-		t = a[21] ^ d1
-		bc4 = bits.RotateLeft64(t, 2)
-		t = a[7] ^ d2
-		bc0 = bits.RotateLeft64(t, 62)
-		t = a[18] ^ d3
-		bc1 = bits.RotateLeft64(t, 55)
-		t = a[4] ^ d4
-		bc2 = bits.RotateLeft64(t, 39)
-		a[10] = bc0 ^ (bc2 &^ bc1)
-		a[21] = bc1 ^ (bc3 &^ bc2)
-		a[7] = bc2 ^ (bc4 &^ bc3)
-		a[18] = bc3 ^ (bc0 &^ bc4)
-		a[4] = bc4 ^ (bc1 &^ bc0)
-
-		// Round 4
-		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
-		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
-		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
-		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
-		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
-		d0 = bc4 ^ (bc1<<1 | bc1>>63)
-		d1 = bc0 ^ (bc2<<1 | bc2>>63)
-		d2 = bc1 ^ (bc3<<1 | bc3>>63)
-		d3 = bc2 ^ (bc4<<1 | bc4>>63)
-		d4 = bc3 ^ (bc0<<1 | bc0>>63)
-
-		bc0 = a[0] ^ d0
-		t = a[1] ^ d1
-		bc1 = bits.RotateLeft64(t, 44)
-		t = a[2] ^ d2
-		bc2 = bits.RotateLeft64(t, 43)
-		t = a[3] ^ d3
-		bc3 = bits.RotateLeft64(t, 21)
-		t = a[4] ^ d4
-		bc4 = bits.RotateLeft64(t, 14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3]
-		a[1] = bc1 ^ (bc3 &^ bc2)
-		a[2] = bc2 ^ (bc4 &^ bc3)
-		a[3] = bc3 ^ (bc0 &^ bc4)
-		a[4] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[5] ^ d0
-		bc2 = bits.RotateLeft64(t, 3)
-		t = a[6] ^ d1
-		bc3 = bits.RotateLeft64(t, 45)
-		t = a[7] ^ d2
-		bc4 = bits.RotateLeft64(t, 61)
-		t = a[8] ^ d3
-		bc0 = bits.RotateLeft64(t, 28)
-		t = a[9] ^ d4
-		bc1 = bits.RotateLeft64(t, 20)
-		a[5] = bc0 ^ (bc2 &^ bc1)
-		a[6] = bc1 ^ (bc3 &^ bc2)
-		a[7] = bc2 ^ (bc4 &^ bc3)
-		a[8] = bc3 ^ (bc0 &^ bc4)
-		a[9] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[10] ^ d0
-		bc4 = bits.RotateLeft64(t, 18)
-		t = a[11] ^ d1
-		bc0 = bits.RotateLeft64(t, 1)
-		t = a[12] ^ d2
-		bc1 = bits.RotateLeft64(t, 6)
-		t = a[13] ^ d3
-		bc2 = bits.RotateLeft64(t, 25)
-		t = a[14] ^ d4
-		bc3 = bits.RotateLeft64(t, 8)
-		a[10] = bc0 ^ (bc2 &^ bc1)
-		a[11] = bc1 ^ (bc3 &^ bc2)
-		a[12] = bc2 ^ (bc4 &^ bc3)
-		a[13] = bc3 ^ (bc0 &^ bc4)
-		a[14] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[15] ^ d0
-		bc1 = bits.RotateLeft64(t, 36)
-		t = a[16] ^ d1
-		bc2 = bits.RotateLeft64(t, 10)
-		t = a[17] ^ d2
-		bc3 = bits.RotateLeft64(t, 15)
-		t = a[18] ^ d3
-		bc4 = bits.RotateLeft64(t, 56)
-		t = a[19] ^ d4
-		bc0 = bits.RotateLeft64(t, 27)
-		a[15] = bc0 ^ (bc2 &^ bc1)
-		a[16] = bc1 ^ (bc3 &^ bc2)
-		a[17] = bc2 ^ (bc4 &^ bc3)
-		a[18] = bc3 ^ (bc0 &^ bc4)
-		a[19] = bc4 ^ (bc1 &^ bc0)
-
-		t = a[20] ^ d0
-		bc3 = bits.RotateLeft64(t, 41)
-		t = a[21] ^ d1
-		bc4 = bits.RotateLeft64(t, 2)
-		t = a[22] ^ d2
-		bc0 = bits.RotateLeft64(t, 62)
-		t = a[23] ^ d3
-		bc1 = bits.RotateLeft64(t, 55)
-		t = a[24] ^ d4
-		bc2 = bits.RotateLeft64(t, 39)
-		a[20] = bc0 ^ (bc2 &^ bc1)
-		a[21] = bc1 ^ (bc3 &^ bc2)
-		a[22] = bc2 ^ (bc4 &^ bc3)
-		a[23] = bc3 ^ (bc0 &^ bc4)
-		a[24] = bc4 ^ (bc1 &^ bc0)
-	}
-}
--- a/crypto/keccak/keccakf_amd64.go
+++ b/crypto/keccak/keccakf_amd64.go
@ -1,13 +1,21 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build amd64 && !purego && gc
+//go:build amd64 && !purego

 package keccak

-// This function is implemented in keccakf_amd64.s.
+import "golang.org/x/sys/cpu"

+func init() { useASM = cpu.X86.HasBMI1 && cpu.X86.HasBMI2 }
+
+// keccakF1600BMI2 applies the Keccak-f[1600] permutation to a. When buf != nil, it first
+// XORs the 136 bytes (17 lanes) at buf into a, saving one full memory pass.
+//
 //go:noescape
+func keccakF1600BMI2(a *[200]byte, buf *byte)

-func keccakF1600(a *[25]uint64)
+func keccakF1600(a *[200]byte) {
+	keccakF1600BMI2(a, nil)
+}
+
+func xorAndPermute(state *[200]byte, buf *byte) {
+	keccakF1600BMI2(state, buf)
+}
--- a/crypto/keccak/keccakf_amd64.s
+++ b/crypto/keccak/keccakf_amd64.s
--- a/crypto/keccak/keccakf_amd64_bmi2.s
+++ b/crypto/keccak/keccakf_amd64_bmi2.s
--- a/crypto/keccak/sha3.go
+++ b/crypto/keccak/sha3.go
@ -1,244 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package keccak
-
-import (
-	"crypto/subtle"
-	"encoding/binary"
-	"errors"
-	"unsafe"
-
-	"golang.org/x/sys/cpu"
-)
-
-// spongeDirection indicates the direction bytes are flowing through the sponge.
-type spongeDirection int
-
-const (
-	// spongeAbsorbing indicates that the sponge is absorbing input.
-	spongeAbsorbing spongeDirection = iota
-	// spongeSqueezing indicates that the sponge is being squeezed.
-	spongeSqueezing
-)
-
-type state struct {
-	a [1600 / 8]byte // main state of the hash
-
-	// a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR
-	// into before running the permutation. If squeezing, it's the remaining
-	// output to produce before running the permutation.
-	n, rate int
-
-	// dsbyte contains the "domain separation" bits and the first bit of
-	// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
-	// SHA-3 and SHAKE functions by appending bitstrings to the message.
-	// Using a little-endian bit-ordering convention, these are "01" for SHA-3
-	// and "1111" for SHAKE, or 00000010b and 00001111b, respectively. Then the
-	// padding rule from section 5.1 is applied to pad the message to a multiple
-	// of the rate, which involves adding a "1" bit, zero or more "0" bits, and
-	// a final "1" bit. We merge the first "1" bit from the padding into dsbyte,
-	// giving 00000110b (0x06) and 00011111b (0x1f).
-	// [1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf
-	//     "Draft FIPS 202: SHA-3 Standard: Permutation-Based Hash and
-	//      Extendable-Output Functions (May 2014)"
-	dsbyte byte
-
-	outputLen int             // the default output size in bytes
-	state     spongeDirection // whether the sponge is absorbing or squeezing
-}
-
-// BlockSize returns the rate of sponge underlying this hash function.
-func (d *state) BlockSize() int { return d.rate }
-
-// Size returns the output size of the hash function in bytes.
-func (d *state) Size() int { return d.outputLen }
-
-// Reset clears the internal state by zeroing the sponge state and
-// the buffer indexes, and setting Sponge.state to absorbing.
-func (d *state) Reset() {
-	// Zero the permutation's state.
-	for i := range d.a {
-		d.a[i] = 0
-	}
-	d.state = spongeAbsorbing
-	d.n = 0
-}
-
-func (d *state) clone() *state {
-	ret := *d
-	return &ret
-}
-
-// permute applies the KeccakF-1600 permutation.
-func (d *state) permute() {
-	var a *[25]uint64
-	if cpu.IsBigEndian {
-		a = new([25]uint64)
-		for i := range a {
-			a[i] = binary.LittleEndian.Uint64(d.a[i*8:])
-		}
-	} else {
-		a = (*[25]uint64)(unsafe.Pointer(&d.a))
-	}
-
-	keccakF1600(a)
-	d.n = 0
-
-	if cpu.IsBigEndian {
-		for i := range a {
-			binary.LittleEndian.PutUint64(d.a[i*8:], a[i])
-		}
-	}
-}
-
-// pads appends the domain separation bits in dsbyte, applies
-// the multi-bitrate 10..1 padding rule, and permutes the state.
-func (d *state) padAndPermute() {
-	// Pad with this instance's domain-separator bits. We know that there's
-	// at least one byte of space in the sponge because, if it were full,
-	// permute would have been called to empty it. dsbyte also contains the
-	// first one bit for the padding. See the comment in the state struct.
-	d.a[d.n] ^= d.dsbyte
-	// This adds the final one bit for the padding. Because of the way that
-	// bits are numbered from the LSB upwards, the final bit is the MSB of
-	// the last byte.
-	d.a[d.rate-1] ^= 0x80
-	// Apply the permutation
-	d.permute()
-	d.state = spongeSqueezing
-}
-
-// Write absorbs more data into the hash's state. It panics if any
-// output has already been read.
-func (d *state) Write(p []byte) (n int, err error) {
-	if d.state != spongeAbsorbing {
-		panic("sha3: Write after Read")
-	}
-
-	n = len(p)
-
-	for len(p) > 0 {
-		x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p)
-		d.n += x
-		p = p[x:]
-
-		// If the sponge is full, apply the permutation.
-		if d.n == d.rate {
-			d.permute()
-		}
-	}
-
-	return
-}
-
-// Read squeezes an arbitrary number of bytes from the sponge.
-func (d *state) Read(out []byte) (n int, err error) {
-	// If we're still absorbing, pad and apply the permutation.
-	if d.state == spongeAbsorbing {
-		d.padAndPermute()
-	}
-
-	n = len(out)
-
-	// Now, do the squeezing.
-	for len(out) > 0 {
-		// Apply the permutation if we've squeezed the sponge dry.
-		if d.n == d.rate {
-			d.permute()
-		}
-
-		x := copy(out, d.a[d.n:d.rate])
-		d.n += x
-		out = out[x:]
-	}
-
-	return
-}
-
-// Sum applies padding to the hash state and then squeezes out the desired
-// number of output bytes. It panics if any output has already been read.
-func (d *state) Sum(in []byte) []byte {
-	if d.state != spongeAbsorbing {
-		panic("sha3: Sum after Read")
-	}
-
-	// Make a copy of the original hash so that caller can keep writing
-	// and summing.
-	dup := d.clone()
-	hash := make([]byte, dup.outputLen, 64) // explicit cap to allow stack allocation
-	dup.Read(hash)
-	return append(in, hash...)
-}
-
-const (
-	magicSHA3   = "sha\x08"
-	magicShake  = "sha\x09"
-	magicCShake = "sha\x0a"
-	magicKeccak = "sha\x0b"
-	// magic || rate || main state || n || sponge direction
-	marshaledSize = len(magicSHA3) + 1 + 200 + 1 + 1
-)
-
-func (d *state) MarshalBinary() ([]byte, error) {
-	return d.AppendBinary(make([]byte, 0, marshaledSize))
-}
-
-func (d *state) AppendBinary(b []byte) ([]byte, error) {
-	switch d.dsbyte {
-	case dsbyteSHA3:
-		b = append(b, magicSHA3...)
-	case dsbyteShake:
-		b = append(b, magicShake...)
-	case dsbyteCShake:
-		b = append(b, magicCShake...)
-	case dsbyteKeccak:
-		b = append(b, magicKeccak...)
-	default:
-		panic("unknown dsbyte")
-	}
-	// rate is at most 168, and n is at most rate.
-	b = append(b, byte(d.rate))
-	b = append(b, d.a[:]...)
-	b = append(b, byte(d.n), byte(d.state))
-	return b, nil
-}
-
-func (d *state) UnmarshalBinary(b []byte) error {
-	if len(b) != marshaledSize {
-		return errors.New("sha3: invalid hash state")
-	}
-
-	magic := string(b[:len(magicSHA3)])
-	b = b[len(magicSHA3):]
-	switch {
-	case magic == magicSHA3 && d.dsbyte == dsbyteSHA3:
-	case magic == magicShake && d.dsbyte == dsbyteShake:
-	case magic == magicCShake && d.dsbyte == dsbyteCShake:
-	case magic == magicKeccak && d.dsbyte == dsbyteKeccak:
-	default:
-		return errors.New("sha3: invalid hash state identifier")
-	}
-
-	rate := int(b[0])
-	b = b[1:]
-	if rate != d.rate {
-		return errors.New("sha3: invalid hash state function")
-	}
-
-	copy(d.a[:], b)
-	b = b[len(d.a):]
-
-	n, state := int(b[0]), spongeDirection(b[1])
-	if n > d.rate {
-		return errors.New("sha3: invalid hash state")
-	}
-	d.n = n
-	if state != spongeAbsorbing && state != spongeSqueezing {
-		return errors.New("sha3: invalid hash state")
-	}
-	d.state = state
-
-	return nil
-}
--- a/crypto/keccak/sha3_test.go
+++ b/crypto/keccak/sha3_test.go
@ -1,210 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package keccak
-
-// Tests include all the ShortMsgKATs provided by the Keccak team at
-// https://github.com/gvanas/KeccakCodePackage
-//
-// They only include the zero-bit case of the bitwise testvectors
-// published by NIST in the draft of FIPS-202.
-
-import (
-	"bytes"
-	"compress/flate"
-	"encoding"
-	"encoding/hex"
-	"encoding/json"
-	"hash"
-	"math/rand"
-	"os"
-	"strings"
-	"testing"
-)
-
-const (
-	testString  = "brekeccakkeccak koax koax"
-	katFilename = "testdata/keccakKats.json.deflate"
-)
-
-// testDigests contains functions returning hash.Hash instances
-// with output-length equal to the KAT length for SHA-3, Keccak
-// and SHAKE instances.
-var testDigests = map[string]func() hash.Hash{
-	"Keccak-256": NewLegacyKeccak256,
-	"Keccak-512": NewLegacyKeccak512,
-}
-
-// decodeHex converts a hex-encoded string into a raw byte string.
-func decodeHex(s string) []byte {
-	b, err := hex.DecodeString(s)
-	if err != nil {
-		panic(err)
-	}
-	return b
-}
-
-// structs used to marshal JSON test-cases.
-type KeccakKats struct {
-	Kats map[string][]struct {
-		Digest  string `json:"digest"`
-		Length  int64  `json:"length"`
-		Message string `json:"message"`
-
-		// Defined only for cSHAKE
-		N string `json:"N"`
-		S string `json:"S"`
-	}
-}
-
-// TestKeccakKats tests the SHA-3 and Shake implementations against all the
-// ShortMsgKATs from https://github.com/gvanas/KeccakCodePackage
-// (The testvectors are stored in keccakKats.json.deflate due to their length.)
-func TestKeccakKats(t *testing.T) {
-	// Read the KATs.
-	deflated, err := os.Open(katFilename)
-	if err != nil {
-		t.Errorf("error opening %s: %s", katFilename, err)
-	}
-	file := flate.NewReader(deflated)
-	dec := json.NewDecoder(file)
-	var katSet KeccakKats
-	err = dec.Decode(&katSet)
-	if err != nil {
-		t.Errorf("error decoding KATs: %s", err)
-	}
-
-	for algo, function := range testDigests {
-		d := function()
-		for _, kat := range katSet.Kats[algo] {
-			d.Reset()
-			in, err := hex.DecodeString(kat.Message)
-			if err != nil {
-				t.Errorf("error decoding KAT: %s", err)
-			}
-			d.Write(in[:kat.Length/8])
-			got := strings.ToUpper(hex.EncodeToString(d.Sum(nil)))
-			if got != kat.Digest {
-				t.Errorf("function=%s, length=%d\nmessage:\n %s\ngot:\n  %s\nwanted:\n %s",
-					algo, kat.Length, kat.Message, got, kat.Digest)
-				t.Logf("wanted %+v", kat)
-				t.FailNow()
-			}
-			continue
-		}
-	}
-}
-
-// TestKeccak does a basic test of the non-standardized Keccak hash functions.
-func TestKeccak(t *testing.T) {
-	tests := []struct {
-		fn   func() hash.Hash
-		data []byte
-		want string
-	}{
-		{
-			NewLegacyKeccak256,
-			[]byte("abc"),
-			"4e03657aea45a94fc7d47ba826c8d667c0d1e6e33a64a036ec44f58fa12d6c45",
-		},
-		{
-			NewLegacyKeccak512,
-			[]byte("abc"),
-			"18587dc2ea106b9a1563e32b3312421ca164c7f1f07bc922a9c83d77cea3a1e5d0c69910739025372dc14ac9642629379540c17e2a65b19d77aa511a9d00bb96",
-		},
-	}
-
-	for _, u := range tests {
-		h := u.fn()
-		h.Write(u.data)
-		got := h.Sum(nil)
-		want := decodeHex(u.want)
-		if !bytes.Equal(got, want) {
-			t.Errorf("unexpected hash for size %d: got '%x' want '%s'", h.Size()*8, got, u.want)
-		}
-	}
-}
-
-// TestUnalignedWrite tests that writing data in an arbitrary pattern with
-// small input buffers.
-func TestUnalignedWrite(t *testing.T) {
-	buf := sequentialBytes(0x10000)
-	for alg, df := range testDigests {
-		d := df()
-		d.Reset()
-		d.Write(buf)
-		want := d.Sum(nil)
-		d.Reset()
-		for i := 0; i < len(buf); {
-			// Cycle through offsets which make a 137 byte sequence.
-			// Because 137 is prime this sequence should exercise all corner cases.
-			offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1}
-			for _, j := range offsets {
-				if v := len(buf) - i; v < j {
-					j = v
-				}
-				d.Write(buf[i : i+j])
-				i += j
-			}
-		}
-		got := d.Sum(nil)
-		if !bytes.Equal(got, want) {
-			t.Errorf("Unaligned writes, alg=%s\ngot %q, want %q", alg, got, want)
-		}
-	}
-}
-
-// sequentialBytes produces a buffer of size consecutive bytes 0x00, 0x01, ..., used for testing.
-//
-// The alignment of each slice is intentionally randomized to detect alignment
-// issues in the implementation. See https://golang.org/issue/37644.
-// Ideally, the compiler should fuzz the alignment itself.
-// (See https://golang.org/issue/35128.)
-func sequentialBytes(size int) []byte {
-	alignmentOffset := rand.Intn(8)
-	result := make([]byte, size+alignmentOffset)[alignmentOffset:]
-	for i := range result {
-		result[i] = byte(i)
-	}
-	return result
-}
-
-func TestMarshalUnmarshal(t *testing.T) {
-	t.Run("Keccak-256", func(t *testing.T) { testMarshalUnmarshal(t, NewLegacyKeccak256()) })
-	t.Run("Keccak-512", func(t *testing.T) { testMarshalUnmarshal(t, NewLegacyKeccak512()) })
-}
-
-// TODO(filippo): move this to crypto/internal/cryptotest.
-func testMarshalUnmarshal(t *testing.T, h hash.Hash) {
-	buf := make([]byte, 200)
-	rand.Read(buf)
-	n := rand.Intn(200)
-	h.Write(buf)
-	want := h.Sum(nil)
-	h.Reset()
-	h.Write(buf[:n])
-	b, err := h.(encoding.BinaryMarshaler).MarshalBinary()
-	if err != nil {
-		t.Errorf("MarshalBinary: %v", err)
-	}
-	h.Write(bytes.Repeat([]byte{0}, 200))
-	if err := h.(encoding.BinaryUnmarshaler).UnmarshalBinary(b); err != nil {
-		t.Errorf("UnmarshalBinary: %v", err)
-	}
-	h.Write(buf[n:])
-	got := h.Sum(nil)
-	if !bytes.Equal(got, want) {
-		t.Errorf("got %x, want %x", got, want)
-	}
-}
-
-// BenchmarkPermutationFunction measures the speed of the permutation function
-// with no input data.
-func BenchmarkPermutationFunction(b *testing.B) {
-	b.SetBytes(int64(200))
-	var lanes [25]uint64
-	for i := 0; i < b.N; i++ {
-		keccakF1600(&lanes)
-	}
-}
--- a/crypto/keccak/testdata/keccakKats.json.deflate
+++ b/crypto/keccak/testdata/keccakKats.json.deflate
--- a/crypto/keccak/gen_keccakf_bmi2.go
+++ b/crypto/keccak/gen_keccakf_bmi2.go
@ -0,0 +1,175 @@
+//go:build ignore
+
+// gen_keccakf_bmi2.go generates keccakf_amd64_bmi2.s — a BMI2-optimized
+// Keccak-f[1600] permutation using RORXQ and ANDNQ.
+// Fully unrolled (all 24 rounds).
+//
+// Key optimizations:
+//   - D values kept in registers (R14, R15, BP, SI, DX), not on stack
+//   - State alternates between the original array (DI) and a 200-byte stack
+//     buffer, avoiding a second 200-byte copy
+//   - Frame is only 200 bytes (25 × 8 for temp state)
+//   - Optional XOR-and-permute: when buf != nil, XORs rate bytes into state
+//     before permuting, eliminating one full memory pass
+//
+// Usage: go run gen_keccakf_bmi2.go
+
+package main
+
+import (
+	"fmt"
+	"os"
+)
+
+var rc = [24]uint64{
+	0x0000000000000001, 0x0000000000008082,
+	0x800000000000808a, 0x8000000080008000,
+	0x000000000000808b, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009,
+	0x000000000000008a, 0x0000000000000088,
+	0x0000000080008009, 0x000000008000000a,
+	0x000000008000808b, 0x800000000000008b,
+	0x8000000000008089, 0x8000000000008003,
+	0x8000000000008002, 0x8000000000000080,
+	0x000000000000800a, 0x800000008000000a,
+	0x8000000080008081, 0x8000000000008080,
+	0x0000000080000001, 0x8000000080008008,
+}
+
+type lane struct {
+	idx int // state lane index (0–24)
+	rot int // left-rotation amount
+}
+
+// Chi groups: each group lists the 5 source lanes, with their rho rotations, that pi
+// gathers into one row; chi then turns that row into 5 consecutive output lanes.
+var groups = [5][5]lane{
+	{{0, 0}, {6, 44}, {12, 43}, {18, 21}, {24, 14}},  // → lanes 0–4
+	{{3, 28}, {9, 20}, {10, 3}, {16, 45}, {22, 61}},  // → lanes 5–9
+	{{1, 1}, {7, 6}, {13, 25}, {19, 8}, {20, 18}},    // → lanes 10–14
+	{{4, 27}, {5, 36}, {11, 10}, {17, 15}, {23, 56}}, // → lanes 15–19
+	{{2, 62}, {8, 55}, {14, 39}, {15, 41}, {21, 2}},  // → lanes 20–24
+}
+
+// D-value registers, indexed by lane%5.
+var dReg = [5]string{"R14", "R15", "BP", "SI", "DX"}
+
+const (
+	fsize     = 200
+	rateLanes = 17 // rate / 8 = 136 / 8 = 17 lanes
+)
+
+var p func(string, ...any)
+
+func main() {
+	f, err := os.Create("keccakf_amd64_bmi2.s")
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+	p = func(format string, args ...any) { fmt.Fprintf(f, format+"\n", args...) }
+
+	p("// Code generated by gen_keccakf_bmi2.go. DO NOT EDIT.")
+	p("")
+	p("//go:build amd64 && !purego")
+	p("")
+	p("#include \"textflag.h\"")
+	p("")
+
+	// Single function: keccakF1600BMI2(a *[200]byte, buf *byte)
+	// When buf != nil, XORs rate bytes into state before permuting.
+	// When buf == nil, just permutes.
+	p("// func keccakF1600BMI2(a *[200]byte, buf *byte)")
+	p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-16", fsize)
+	p("\tMOVQ a+0(FP), DI")
+	p("\tMOVQ buf+8(FP), BX")
+	p("\tTESTQ BX, BX")
+	p("\tJZ rounds")
+	p("")
+	p("\t// XOR %d lanes (%d bytes) of buf into state.", rateLanes, rateLanes*8)
+	for i := 0; i < rateLanes; i++ {
+		p("\tMOVQ %d(BX), AX", i*8)
+		p("\tXORQ AX, %d(DI)", i*8)
+	}
+	p("")
+	p("rounds:")
+
+	for round := 0; round < 24; round++ {
+		p("")
+		p("\t// Round %d", round)
+		srcArray := (round % 2) == 0
+		emitRound(srcArray, round)
+	}
+
+	p("\tRET")
+}
+
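+// emitRound emits one Keccak-f round. If srcArray is true the round reads the
+// state from the array (DI) and writes to the stack buffer (SP); if false it
+// reads from the stack buffer (SP) and writes back to the array (DI).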
+func emitRound(srcArray bool, round int) {
+	// Load round constant into R13.
+	p("\tMOVQ $0x%016x, R13", rc[round])
+
+	// Theta: 5 column parities → AX, BX, CX, DX, SI.
+	colR := [5]string{"AX", "BX", "CX", "DX", "SI"}
+	for c := 0; c < 5; c++ {
+		p("\tMOVQ %s, %s", off(c, srcArray), colR[c])
+		for r := 1; r < 5; r++ {
+			p("\tXORQ %s, %s", off(r*5+c, srcArray), colR[c])
+		}
+	}
+
+	// D values: D[x] = C[(x+4)%5] ^ rol(C[(x+1)%5], 1).
+	// D[0..2] go directly into R14, R15, BP (no conflicts).
+	for _, x := range []int{0, 1, 2} {
+		p("\tRORXQ $63, %s, %s", colR[(x+1)%5], dReg[x])
+		p("\tXORQ %s, %s", colR[(x+4)%5], dReg[x])
+	}
+	// D[3] and D[4] target SI and DX, which still hold column parities
+	// C[4] and C[3] needed as inputs, so compute via temps first.
+	p("\tRORXQ $63, SI, R8")
+	p("\tXORQ CX, R8")
+	p("\tRORXQ $63, AX, R9")
+	p("\tXORQ DX, R9")
+	p("\tMOVQ R8, SI") // SI = D[3]
+	p("\tMOVQ R9, DX") // DX = D[4]
+
+	// Five chi groups.
+	for g := 0; g < 5; g++ {
+		emitChi(g, srcArray, g == 0)
+	}
+}
+
+func emitChi(g int, srcArray, first bool) {
+	B := [5]string{"R8", "R9", "R10", "R11", "R12"}
+
+	// Load lane, XOR with D (register!), rotate.
+	for i := 0; i < 5; i++ {
+		l := groups[g][i]
+		p("\tMOVQ %s, %s", off(l.idx, srcArray), B[i])
+		p("\tXORQ %s, %s", dReg[l.idx%5], B[i])
+		if l.rot != 0 {
+			p("\tRORXQ $%d, %s, %s", 64-l.rot, B[i], B[i])
+		}
+	}
+
+	// Chi: out[j] = B[j] ^ (~B[(j+1)%5] & B[(j+2)%5]).
+	for j := 0; j < 5; j++ {
+		p("\tANDNQ %s, %s, AX", B[(j+2)%5], B[(j+1)%5])
+		p("\tXORQ %s, AX", B[j])
+		if first && j == 0 {
+			p("\tXORQ R13, AX")
+		}
+		p("\tMOVQ AX, %s", off(g*5+j, !srcArray))
+	}
+}
+
+// off returns the memory operand for lane idx: based at DI if array is true, else at SP.
+func off(idx int, array bool) string {
+	o := idx * 8
+	if array {
+		return fmt.Sprintf("%d(DI)", o)
+	}
+	return fmt.Sprintf("%d(SP)", o)
+}
--- a/crypto/keccak_ziren.go
+++ b/crypto/keccak_ziren.go
@ -21,6 +21,7 @@ package crypto
 import (
 	"github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime"
 	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/crypto/keccak"
 )

 // zirenKeccakState implements the KeccakState interface using the Ziren zkvm_runtime.
@ -31,7 +32,7 @@ type zirenKeccakState struct {
 	dirty  bool   // whether new data has been written since last hash
 }

-func newZirenKeccakState() KeccakState {
+func newZirenKeccakState() keccak.KeccakState {
 	return &zirenKeccakState{
 		buf: make([]byte, 0, 512), // pre-allocate reasonable capacity
 	}
@ -85,7 +86,7 @@ func (s *zirenKeccakState) computeHashIfNeeded() {

 // NewKeccakState creates a new KeccakState
 // This uses a Ziren-optimized implementation that leverages the zkvm_runtime.Keccak256 system call.
-func NewKeccakState() KeccakState {
+func NewKeccakState() keccak.KeccakState {
 	return newZirenKeccakState()
 }

--- a/trie/hasher.go
+++ b/trie/hasher.go
@ -22,13 +22,14 @@ import (
 	"sync"

 	"github.com/ethereum/go-ethereum/crypto"
+	"github.com/ethereum/go-ethereum/crypto/keccak"
 	"github.com/ethereum/go-ethereum/rlp"
 )

 // hasher is a type used for the trie Hash operation. A hasher has some
 // internal preallocated temp space
 type hasher struct {
-	sha      crypto.KeccakState
+	sha      keccak.KeccakState
 	tmp      []byte
 	encbuf   rlp.EncoderBuffer
 	parallel bool // Whether to use parallel threads when hashing