mirror of
https://github.com/ethereum/go-ethereum.git
synced 2026-05-24 16:59:26 +00:00
crypto/keccak: add BMI2 keccak-f[1600] implementation for amd64
This commit is contained in:
parent
dbb657241f
commit
73cedabb8d
7 changed files with 4872 additions and 5527 deletions
|
|
@ -5,105 +5,18 @@ package keccak
|
|||
import (
|
||||
"runtime"
|
||||
|
||||
"golang.org/x/crypto/sha3"
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
// Apple Silicon always has Armv8.2-A SHA3 extensions (VEOR3, VRAX1, VXAR, VBCAX).
|
||||
// On other ARM64 platforms, detect at runtime via CPU feature flags.
|
||||
// When SHA3 is unavailable, falls back to x/crypto/sha3.
|
||||
var useSHA3 = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3
|
||||
func init() {
|
||||
useASM = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3
|
||||
}
|
||||
|
||||
//go:noescape
|
||||
func keccakF1600(a *[200]byte)
|
||||
|
||||
//go:noescape
|
||||
func xorAndPermute(state *[200]byte, buf *byte)
|
||||
|
||||
// Sum256 computes the Keccak-256 hash of data. Zero heap allocations when SHA3 is available.
|
||||
func Sum256(data []byte) [32]byte {
|
||||
if !useSHA3 {
|
||||
return sum256XCrypto(data)
|
||||
}
|
||||
return sum256Sponge(data)
|
||||
}
|
||||
|
||||
func sum256XCrypto(data []byte) [32]byte {
|
||||
h := sha3.NewLegacyKeccak256()
|
||||
h.Write(data)
|
||||
var out [32]byte
|
||||
h.Sum(out[:0])
|
||||
return out
|
||||
}
|
||||
|
||||
// Hasher is a streaming Keccak-256 hasher.
|
||||
// Uses NEON SHA3 assembly when available, x/crypto/sha3 otherwise.
|
||||
type Hasher struct {
|
||||
sponge
|
||||
xc KeccakState // x/crypto fallback
|
||||
}
|
||||
|
||||
// Reset resets the hasher to its initial state.
|
||||
func (h *Hasher) Reset() {
|
||||
if useSHA3 {
|
||||
h.sponge.Reset()
|
||||
} else {
|
||||
if h.xc == nil {
|
||||
h.xc = sha3.NewLegacyKeccak256().(KeccakState)
|
||||
} else {
|
||||
h.xc.Reset()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write absorbs data into the hasher.
|
||||
// Panics if called after Read.
|
||||
func (h *Hasher) Write(p []byte) (int, error) {
|
||||
if !useSHA3 {
|
||||
if h.xc == nil {
|
||||
h.xc = sha3.NewLegacyKeccak256().(KeccakState)
|
||||
}
|
||||
return h.xc.Write(p)
|
||||
}
|
||||
return h.sponge.Write(p)
|
||||
}
|
||||
|
||||
// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
|
||||
// Does not modify the hasher state.
|
||||
func (h *Hasher) Sum256() [32]byte {
|
||||
if !useSHA3 {
|
||||
if h.xc == nil {
|
||||
return Sum256(nil)
|
||||
}
|
||||
var out [32]byte
|
||||
h.xc.Sum(out[:0])
|
||||
return out
|
||||
}
|
||||
return h.sponge.Sum256()
|
||||
}
|
||||
|
||||
// Sum appends the current Keccak-256 digest to b and returns the resulting slice.
|
||||
// Does not modify the hasher state.
|
||||
func (h *Hasher) Sum(b []byte) []byte {
|
||||
if !useSHA3 {
|
||||
if h.xc == nil {
|
||||
d := Sum256(nil)
|
||||
return append(b, d[:]...)
|
||||
}
|
||||
return h.xc.Sum(b)
|
||||
}
|
||||
return h.sponge.Sum(b)
|
||||
}
|
||||
|
||||
// Read squeezes an arbitrary number of bytes from the sponge.
|
||||
// On the first call, it pads and permutes, transitioning from absorbing to squeezing.
|
||||
// Subsequent calls to Write will panic. It never returns an error.
|
||||
func (h *Hasher) Read(out []byte) (int, error) {
|
||||
if !useSHA3 {
|
||||
if h.xc == nil {
|
||||
h.xc = sha3.NewLegacyKeccak256().(KeccakState)
|
||||
}
|
||||
return h.xc.Read(out)
|
||||
}
|
||||
return h.sponge.Read(out)
|
||||
}
|
||||
|
|
@ -2,7 +2,15 @@
|
|||
|
||||
package keccak
|
||||
|
||||
import "unsafe"
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/crypto/sha3"
|
||||
)
|
||||
|
||||
// useASM is set by platform-specific init to indicate hardware acceleration is available.
|
||||
// When false, Sum256 and Hasher fall back to x/crypto/sha3.
|
||||
var useASM bool
|
||||
|
||||
// sponge is the core Keccak-256 sponge state used by native (asm) implementations.
|
||||
type sponge struct {
|
||||
|
|
@ -120,6 +128,95 @@ func sum256Sponge(data []byte) [32]byte {
|
|||
return [32]byte(state[:32])
|
||||
}
|
||||
|
||||
// Sum256 computes the Keccak-256 hash of data. Zero heap allocations when hardware
|
||||
// acceleration is available.
|
||||
func Sum256(data []byte) [32]byte {
|
||||
if !useASM {
|
||||
return sum256XCrypto(data)
|
||||
}
|
||||
return sum256Sponge(data)
|
||||
}
|
||||
|
||||
func sum256XCrypto(data []byte) [32]byte {
|
||||
h := sha3.NewLegacyKeccak256()
|
||||
h.Write(data)
|
||||
var out [32]byte
|
||||
h.Sum(out[:0])
|
||||
return out
|
||||
}
|
||||
|
||||
// Hasher is a streaming Keccak-256 hasher.
|
||||
// Uses platform assembly when available, x/crypto/sha3 otherwise.
|
||||
type Hasher struct {
|
||||
sponge
|
||||
xc KeccakState // x/crypto fallback
|
||||
}
|
||||
|
||||
// Reset resets the hasher to its initial state.
|
||||
func (h *Hasher) Reset() {
|
||||
if useASM {
|
||||
h.sponge.Reset()
|
||||
} else {
|
||||
if h.xc == nil {
|
||||
h.xc = sha3.NewLegacyKeccak256().(KeccakState)
|
||||
} else {
|
||||
h.xc.Reset()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write absorbs data into the hasher.
|
||||
// Panics if called after Read.
|
||||
func (h *Hasher) Write(p []byte) (int, error) {
|
||||
if !useASM {
|
||||
if h.xc == nil {
|
||||
h.xc = sha3.NewLegacyKeccak256().(KeccakState)
|
||||
}
|
||||
return h.xc.Write(p)
|
||||
}
|
||||
return h.sponge.Write(p)
|
||||
}
|
||||
|
||||
// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
|
||||
// Does not modify the hasher state.
|
||||
func (h *Hasher) Sum256() [32]byte {
|
||||
if !useASM {
|
||||
if h.xc == nil {
|
||||
return Sum256(nil)
|
||||
}
|
||||
var out [32]byte
|
||||
h.xc.Sum(out[:0])
|
||||
return out
|
||||
}
|
||||
return h.sponge.Sum256()
|
||||
}
|
||||
|
||||
// Sum appends the current Keccak-256 digest to b and returns the resulting slice.
|
||||
// Does not modify the hasher state.
|
||||
func (h *Hasher) Sum(b []byte) []byte {
|
||||
if !useASM {
|
||||
if h.xc == nil {
|
||||
d := Sum256(nil)
|
||||
return append(b, d[:]...)
|
||||
}
|
||||
return h.xc.Sum(b)
|
||||
}
|
||||
return h.sponge.Sum(b)
|
||||
}
|
||||
|
||||
// Read squeezes an arbitrary number of bytes from the sponge.
|
||||
// On the first call, it pads and permutes, transitioning from absorbing to squeezing.
|
||||
// Subsequent calls to Write will panic. It never returns an error.
|
||||
func (h *Hasher) Read(out []byte) (int, error) {
|
||||
if !useASM {
|
||||
if h.xc == nil {
|
||||
h.xc = sha3.NewLegacyKeccak256().(KeccakState)
|
||||
}
|
||||
return h.xc.Read(out)
|
||||
}
|
||||
return h.sponge.Read(out)
|
||||
}
|
||||
|
||||
func xorIn(state *[200]byte, data []byte) {
|
||||
stateU64 := (*[25]uint64)(unsafe.Pointer(state))
|
||||
n := len(data) >> 3
|
||||
|
|
|
|||
|
|
@ -267,13 +267,14 @@ func benchName(size int) string {
|
|||
}
|
||||
}
|
||||
|
||||
func BenchmarkFasterKeccak(b *testing.B) {
|
||||
// BenchmarkKeccak256Sum tests Sum256 with local faster_keccak implementation.
|
||||
func BenchmarkKeccak256Sum(b *testing.B) {
|
||||
for _, size := range benchSizes {
|
||||
data := make([]byte, size)
|
||||
for i := range data {
|
||||
data[i] = byte(i)
|
||||
}
|
||||
b.Run(benchName(size), func(b *testing.B) {
|
||||
b.Run("FasterKeccak/"+benchName(size), func(b *testing.B) {
|
||||
b.SetBytes(int64(size))
|
||||
b.ReportAllocs()
|
||||
for b.Loop() {
|
||||
|
|
@ -283,13 +284,14 @@ func BenchmarkFasterKeccak(b *testing.B) {
|
|||
}
|
||||
}
|
||||
|
||||
func BenchmarkXCrypto(b *testing.B) {
|
||||
// BenchmarkKeccak256Stdlib benchmarks hashing with golang.org/x/crypto/sha3 (the x/crypto module, not the Go standard library).
|
||||
func BenchmarkKeccak256Stdlib(b *testing.B) {
|
||||
for _, size := range benchSizes {
|
||||
data := make([]byte, size)
|
||||
for i := range data {
|
||||
data[i] = byte(i)
|
||||
}
|
||||
b.Run(benchName(size), func(b *testing.B) {
|
||||
b.Run("StdLib/"+benchName(size), func(b *testing.B) {
|
||||
b.SetBytes(int64(size))
|
||||
b.ReportAllocs()
|
||||
h := sha3.NewLegacyKeccak256()
|
||||
|
|
@ -302,13 +304,14 @@ func BenchmarkXCrypto(b *testing.B) {
|
|||
}
|
||||
}
|
||||
|
||||
func BenchmarkFasterKeccakHasher(b *testing.B) {
|
||||
// BenchmarkKeccak256Hasher tests Hasher.Sum256() with local faster_keccak implementation.
|
||||
func BenchmarkKeccak256Hasher(b *testing.B) {
|
||||
for _, size := range benchSizes {
|
||||
data := make([]byte, size)
|
||||
for i := range data {
|
||||
data[i] = byte(i)
|
||||
}
|
||||
b.Run(benchName(size), func(b *testing.B) {
|
||||
b.Run("FasterKeccak/"+benchName(size), func(b *testing.B) {
|
||||
b.SetBytes(int64(size))
|
||||
b.ReportAllocs()
|
||||
var h Hasher
|
||||
|
|
@ -321,13 +324,35 @@ func BenchmarkFasterKeccakHasher(b *testing.B) {
|
|||
}
|
||||
}
|
||||
|
||||
// BenchmarkKeccakStreaming_Sha3 benchmarks the standard sha3 streaming hasher (Reset+Write+Read).
|
||||
func BenchmarkKeccakStreaming_Sha3(b *testing.B) {
|
||||
// BenchmarkKeccak256HasherStdlib tests Hasher API with golang.org/x/crypto/sha3 standard library.
|
||||
func BenchmarkKeccak256HasherStdlib(b *testing.B) {
|
||||
for _, size := range benchSizes {
|
||||
data := make([]byte, size)
|
||||
for i := range data {
|
||||
data[i] = byte(i)
|
||||
}
|
||||
b.Run("StdLib/"+benchName(size), func(b *testing.B) {
|
||||
b.SetBytes(int64(size))
|
||||
b.ReportAllocs()
|
||||
h := sha3.NewLegacyKeccak256().(KeccakState)
|
||||
var buf [32]byte
|
||||
for b.Loop() {
|
||||
h.Reset()
|
||||
h.Write(data)
|
||||
h.Read(buf[:])
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkKeccakStreaming benchmarks the streaming hasher (Reset+Write+Read).
|
||||
// Use with benchstat: go test -bench=BenchmarkKeccakStreaming -benchmem ./... | benchstat
|
||||
func BenchmarkKeccakStreaming(b *testing.B) {
|
||||
data := make([]byte, 32)
|
||||
for i := range data {
|
||||
data[i] = byte(i)
|
||||
}
|
||||
h := sha3.NewLegacyKeccak256().(KeccakState)
|
||||
var h Hasher
|
||||
var buf [32]byte
|
||||
b.SetBytes(int64(len(data)))
|
||||
b.ReportAllocs()
|
||||
|
|
|
|||
|
|
@ -2,16 +2,20 @@
|
|||
|
||||
package keccak
|
||||
|
||||
import "unsafe"
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
// init enables the assembly-backed path when the cpu package reports BMI2.
func init() {
	useASM = cpu.X86.HasBMI2
}
|
||||
|
||||
//go:noescape
|
||||
func keccakF1600(a *[200]byte)
|
||||
func keccakF1600BMI2(a *[200]byte)
|
||||
|
||||
// Sum256 computes the Keccak-256 hash of data. Zero heap allocations.
|
||||
func Sum256(data []byte) [32]byte { return sum256Sponge(data) }
|
||||
|
||||
// Hasher is a streaming Keccak-256 hasher. Designed for stack allocation.
|
||||
type Hasher struct{ sponge }
|
||||
func keccakF1600(a *[200]byte) {
|
||||
keccakF1600BMI2(a)
|
||||
}
|
||||
|
||||
func xorAndPermute(state *[200]byte, buf *byte) {
|
||||
xorIn(state, unsafe.Slice(buf, rate))
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
4570
crypto/keccak/keccakf_amd64_bmi2.s
Normal file
4570
crypto/keccak/keccakf_amd64_bmi2.s
Normal file
File diff suppressed because it is too large
Load diff
156
crypto/keccak/gen_keccakf_bmi2.go
Normal file
156
crypto/keccak/gen_keccakf_bmi2.go
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
//go:build ignore
|
||||
|
||||
// gen_keccakf_bmi2.go generates keccakf_amd64_bmi2.s — a BMI2-optimized
|
||||
// Keccak-f[1600] permutation using RORXQ and ANDNQ.
|
||||
// Fully unrolled (all 24 rounds).
|
||||
//
|
||||
// Key optimizations:
|
||||
// - D values kept in registers (R14, R15, BP, SI, DX), not on stack
|
||||
// - State alternates between the original array (DI) and a 200-byte stack
|
||||
// buffer, avoiding a second 200-byte copy
|
||||
// - Frame is only 200 bytes (25 × 8 for temp state)
|
||||
//
|
||||
// Usage: go run gen_keccakf_bmi2.go
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
var rc = [24]uint64{
|
||||
0x0000000000000001, 0x0000000000008082,
|
||||
0x800000000000808a, 0x8000000080008000,
|
||||
0x000000000000808b, 0x0000000080000001,
|
||||
0x8000000080008081, 0x8000000000008009,
|
||||
0x000000000000008a, 0x0000000000000088,
|
||||
0x0000000080008009, 0x000000008000000a,
|
||||
0x000000008000808b, 0x800000000000008b,
|
||||
0x8000000000008089, 0x8000000000008003,
|
||||
0x8000000000008002, 0x8000000000000080,
|
||||
0x000000000000800a, 0x800000008000000a,
|
||||
0x8000000080008081, 0x8000000000008080,
|
||||
0x0000000080000001, 0x8000000080008008,
|
||||
}
|
||||
|
||||
// lane names one source lane of the state together with the rotation applied
// to it before chi.
type lane struct {
	// idx is the lane's position in the 25-lane state (0–24).
	idx int
	// rot is the number of bits the lane is rotated left.
	rot int
}
|
||||
|
||||
// Chi groups: each group reads 5 lanes (after theta+rho+pi)
|
||||
// and produces 5 consecutive output lanes.
|
||||
var groups = [5][5]lane{
|
||||
{{0, 0}, {6, 44}, {12, 43}, {18, 21}, {24, 14}}, // → lanes 0–4
|
||||
{{3, 28}, {9, 20}, {10, 3}, {16, 45}, {22, 61}}, // → lanes 5–9
|
||||
{{1, 1}, {7, 6}, {13, 25}, {19, 8}, {20, 18}}, // → lanes 10–14
|
||||
{{4, 27}, {5, 36}, {11, 10}, {17, 15}, {23, 56}}, // → lanes 15–19
|
||||
{{2, 62}, {8, 55}, {14, 39}, {15, 41}, {21, 2}}, // → lanes 20–24
|
||||
}
|
||||
|
||||
// D-value registers, indexed by lane%5.
|
||||
var dReg = [5]string{"R14", "R15", "BP", "SI", "DX"}
|
||||
|
||||
const fsize = 200
|
||||
|
||||
var p func(string, ...any)
|
||||
|
||||
func main() {
|
||||
f, err := os.Create("keccakf_amd64_bmi2.s")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer f.Close()
|
||||
p = func(format string, args ...any) { fmt.Fprintf(f, format+"\n", args...) }
|
||||
|
||||
p("// Code generated by gen_keccakf_bmi2.go. DO NOT EDIT.")
|
||||
p("")
|
||||
p("//go:build !purego")
|
||||
p("")
|
||||
p("#include \"textflag.h\"")
|
||||
p("")
|
||||
|
||||
// Function.
|
||||
p("// func keccakF1600BMI2(a *[200]byte)")
|
||||
p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-8", fsize)
|
||||
p("\tMOVQ a+0(FP), DI")
|
||||
|
||||
for round := 0; round < 24; round++ {
|
||||
p("")
|
||||
p("\t// Round %d", round)
|
||||
srcArray := (round % 2) == 0
|
||||
emitRound(srcArray, round)
|
||||
}
|
||||
|
||||
p("\tRET")
|
||||
}
|
||||
|
||||
// srcArray: true = source is array (DI), dest is stack (SP)
|
||||
//
|
||||
// false = source is stack (SP), dest is array (DI)
|
||||
func emitRound(srcArray bool, round int) {
|
||||
// Load round constant into R13.
|
||||
p("\tMOVQ $0x%016x, R13", rc[round])
|
||||
|
||||
// Theta: 5 column parities → AX, BX, CX, DX, SI.
|
||||
colR := [5]string{"AX", "BX", "CX", "DX", "SI"}
|
||||
for c := 0; c < 5; c++ {
|
||||
p("\tMOVQ %s, %s", off(c, srcArray), colR[c])
|
||||
for r := 1; r < 5; r++ {
|
||||
p("\tXORQ %s, %s", off(r*5+c, srcArray), colR[c])
|
||||
}
|
||||
}
|
||||
|
||||
// D values: D[x] = C[(x+4)%5] ^ rol(C[(x+1)%5], 1).
|
||||
for _, x := range []int{0, 1, 2} {
|
||||
p("\tRORXQ $63, %s, %s", colR[(x+1)%5], dReg[x])
|
||||
p("\tXORQ %s, %s", colR[(x+4)%5], dReg[x])
|
||||
}
|
||||
// Do = CX ^ rol(SI, 1) → R8 temp, then move to SI
|
||||
p("\tRORXQ $63, SI, R8")
|
||||
p("\tXORQ CX, R8")
|
||||
// Du = DX ^ rol(AX, 1) → R9 temp, then move to DX
|
||||
p("\tRORXQ $63, AX, R9")
|
||||
p("\tXORQ DX, R9")
|
||||
p("\tMOVQ R8, SI") // SI = Do
|
||||
p("\tMOVQ R9, DX") // DX = Du
|
||||
|
||||
// Five chi groups.
|
||||
for g := 0; g < 5; g++ {
|
||||
emitChi(g, srcArray, g == 0)
|
||||
}
|
||||
}
|
||||
|
||||
func emitChi(g int, srcArray, first bool) {
|
||||
B := [5]string{"R8", "R9", "R10", "R11", "R12"}
|
||||
|
||||
// Load lane, XOR with D (register!), rotate.
|
||||
for i := 0; i < 5; i++ {
|
||||
l := groups[g][i]
|
||||
p("\tMOVQ %s, %s", off(l.idx, srcArray), B[i])
|
||||
p("\tXORQ %s, %s", dReg[l.idx%5], B[i])
|
||||
if l.rot != 0 {
|
||||
p("\tRORXQ $%d, %s, %s", 64-l.rot, B[i], B[i])
|
||||
}
|
||||
}
|
||||
|
||||
// Chi: out[j] = B[j] ^ (~B[(j+1)%5] & B[(j+2)%5]).
|
||||
for j := 0; j < 5; j++ {
|
||||
p("\tANDNQ %s, %s, AX", B[(j+2)%5], B[(j+1)%5])
|
||||
p("\tXORQ %s, AX", B[j])
|
||||
if first && j == 0 {
|
||||
p("\tXORQ R13, AX")
|
||||
}
|
||||
p("\tMOVQ AX, %s", off(g*5+j, !srcArray))
|
||||
}
|
||||
}
|
||||
|
||||
// off returns the memory operand for lane idx: the byte offset idx*8 relative
// to DI when array is true, or relative to SP otherwise.
func off(idx int, array bool) string {
	base := "SP"
	if array {
		base = "DI"
	}
	return fmt.Sprintf("%d(%s)", idx*8, base)
}
|
||||
Loading…
Reference in a new issue