crypto/keccak: add BMI2 keccak-f[1600] implementation for amd64

This commit is contained in:
Sahil-4555 2026-03-14 10:17:05 +05:30 committed by Sahil Sojitra
parent dbb657241f
commit 73cedabb8d
7 changed files with 4872 additions and 5527 deletions

View file

@ -5,105 +5,18 @@ package keccak
import (
"runtime"
"golang.org/x/crypto/sha3"
"golang.org/x/sys/cpu"
)
// useSHA3 reports whether the Armv8.2-A SHA3 instructions (VEOR3, VRAX1, VXAR,
// VBCAX) may be used. Apple Silicon always has them, so darwin and ios are
// accepted unconditionally; on other ARM64 platforms the CPU feature flag is
// consulted at runtime. When it is false, hashing falls back to x/crypto/sha3.
var useSHA3 = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3
// init decides whether the assembly sponge can be used on this machine.
// darwin and ios are accepted unconditionally; every other OS defers to the
// SHA3 CPU feature flag.
func init() {
	switch runtime.GOOS {
	case "darwin", "ios":
		useASM = true
	default:
		useASM = cpu.ARM64.HasSHA3
	}
}
// keccakF1600 is declared without a body, so its implementation lives outside
// this Go file. It operates in place on the 200-byte state a.
// NOTE(review): presumably the Keccak-f[1600] permutation, going by the name —
// confirm against the assembly source.
//
//go:noescape
func keccakF1600(a *[200]byte)
// xorAndPermute is declared without a body, so its implementation lives
// outside this Go file.
// NOTE(review): presumably XORs one rate-sized block starting at buf into
// state and then permutes it — confirm against the assembly source.
//
//go:noescape
func xorAndPermute(state *[200]byte, buf *byte)
// Sum256 returns the Keccak-256 digest of data. The native sponge is used when
// SHA3 instructions are available (zero heap allocations on that path);
// otherwise the x/crypto implementation computes the digest.
func Sum256(data []byte) [32]byte {
	if useSHA3 {
		return sum256Sponge(data)
	}
	return sum256XCrypto(data)
}
// sum256XCrypto computes the Keccak-256 digest of data with the legacy Keccak
// hasher from x/crypto/sha3.
func sum256XCrypto(data []byte) [32]byte {
	var digest [32]byte
	hasher := sha3.NewLegacyKeccak256()
	hasher.Write(data)
	// Appending to a zero-length slice of digest writes the result in place.
	hasher.Sum(digest[:0])
	return digest
}
// Hasher is a streaming Keccak-256 hasher.
// Uses NEON SHA3 assembly when available, x/crypto/sha3 otherwise.
type Hasher struct {
	// Native sponge state; the methods use it when useSHA3 is true.
	sponge
	// x/crypto fallback; created on first use when useSHA3 is false.
	xc KeccakState
}
// Reset returns the hasher to its initial state. On the fallback path the
// x/crypto hasher is created if it does not exist yet, and reset otherwise.
func (h *Hasher) Reset() {
	if useSHA3 {
		h.sponge.Reset()
		return
	}
	if h.xc != nil {
		h.xc.Reset()
		return
	}
	h.xc = sha3.NewLegacyKeccak256().(KeccakState)
}
// Write absorbs p into the hasher and returns whatever the active
// implementation reports. Writing after Read panics.
func (h *Hasher) Write(p []byte) (int, error) {
	if useSHA3 {
		return h.sponge.Write(p)
	}
	// Fallback path: create the x/crypto hasher on first use.
	if h.xc == nil {
		h.xc = sha3.NewLegacyKeccak256().(KeccakState)
	}
	return h.xc.Write(p)
}
// Sum256 returns the 32-byte Keccak-256 digest of everything written so far.
// The hasher state is left untouched.
func (h *Hasher) Sum256() [32]byte {
	switch {
	case useSHA3:
		return h.sponge.Sum256()
	case h.xc == nil:
		// Nothing was ever written on the fallback path: digest of empty input.
		return Sum256(nil)
	}
	var digest [32]byte
	h.xc.Sum(digest[:0])
	return digest
}
// Sum appends the current Keccak-256 digest to b and returns the extended
// slice. The hasher state is left untouched.
func (h *Hasher) Sum(b []byte) []byte {
	if useSHA3 {
		return h.sponge.Sum(b)
	}
	if h.xc != nil {
		return h.xc.Sum(b)
	}
	// Nothing was ever written on the fallback path: digest of empty input.
	empty := Sum256(nil)
	return append(b, empty[:]...)
}
// Read squeezes len(out) bytes out of the sponge. The first call pads and
// permutes, switching the hasher from absorbing to squeezing; any later Write
// panics. It never returns an error.
func (h *Hasher) Read(out []byte) (int, error) {
	if useSHA3 {
		return h.sponge.Read(out)
	}
	// Fallback path: create the x/crypto hasher on first use.
	if h.xc == nil {
		h.xc = sha3.NewLegacyKeccak256().(KeccakState)
	}
	return h.xc.Read(out)
}

View file

@ -2,7 +2,15 @@
package keccak
import "unsafe"
import (
"unsafe"
"golang.org/x/crypto/sha3"
)
// useASM is set by a platform-specific init function to indicate that the
// assembly implementation may be used on the running CPU. It stays false when
// no init sets it, in which case Sum256 and Hasher fall back to x/crypto/sha3.
var useASM bool
// sponge is the core Keccak-256 sponge state used by native (asm) implementations.
type sponge struct {
@ -120,6 +128,95 @@ func sum256Sponge(data []byte) [32]byte {
return [32]byte(state[:32])
}
// Sum256 returns the Keccak-256 digest of data. With hardware acceleration the
// native sponge is used (zero heap allocations on that path); otherwise the
// x/crypto implementation computes the digest.
func Sum256(data []byte) [32]byte {
	if useASM {
		return sum256Sponge(data)
	}
	return sum256XCrypto(data)
}
// sum256XCrypto computes the Keccak-256 digest of data with the legacy Keccak
// hasher from x/crypto/sha3.
func sum256XCrypto(data []byte) [32]byte {
	var digest [32]byte
	hasher := sha3.NewLegacyKeccak256()
	hasher.Write(data)
	// Appending to a zero-length slice of digest writes the result in place.
	hasher.Sum(digest[:0])
	return digest
}
// Hasher is a streaming Keccak-256 hasher.
// Uses platform assembly when available, x/crypto/sha3 otherwise.
type Hasher struct {
	// Native sponge state; the methods use it when useASM is true.
	sponge
	// x/crypto fallback; created on first use when useASM is false.
	xc KeccakState
}
// Reset returns the hasher to its initial state. On the fallback path the
// x/crypto hasher is created if it does not exist yet, and reset otherwise.
func (h *Hasher) Reset() {
	switch {
	case useASM:
		h.sponge.Reset()
	case h.xc == nil:
		h.xc = sha3.NewLegacyKeccak256().(KeccakState)
	default:
		h.xc.Reset()
	}
}
// Write absorbs p into the hasher and returns whatever the active
// implementation reports. Writing after Read panics.
func (h *Hasher) Write(p []byte) (int, error) {
	if useASM {
		return h.sponge.Write(p)
	}
	// Fallback path: create the x/crypto hasher on first use.
	if h.xc == nil {
		h.xc = sha3.NewLegacyKeccak256().(KeccakState)
	}
	return h.xc.Write(p)
}
// Sum256 returns the 32-byte Keccak-256 digest of everything written so far.
// The hasher state is left untouched.
func (h *Hasher) Sum256() [32]byte {
	if useASM {
		return h.sponge.Sum256()
	}
	if h.xc == nil {
		// Nothing was ever written on the fallback path: digest of empty input.
		return Sum256(nil)
	}
	var digest [32]byte
	h.xc.Sum(digest[:0])
	return digest
}
// Sum appends the current Keccak-256 digest to b and returns the extended
// slice. The hasher state is left untouched.
func (h *Hasher) Sum(b []byte) []byte {
	switch {
	case useASM:
		return h.sponge.Sum(b)
	case h.xc != nil:
		return h.xc.Sum(b)
	}
	// Nothing was ever written on the fallback path: digest of empty input.
	empty := Sum256(nil)
	return append(b, empty[:]...)
}
// Read squeezes len(out) bytes out of the sponge. The first call pads and
// permutes, switching the hasher from absorbing to squeezing; any later Write
// panics. It never returns an error.
func (h *Hasher) Read(out []byte) (int, error) {
	if useASM {
		return h.sponge.Read(out)
	}
	// Fallback path: create the x/crypto hasher on first use.
	if h.xc == nil {
		h.xc = sha3.NewLegacyKeccak256().(KeccakState)
	}
	return h.xc.Read(out)
}
func xorIn(state *[200]byte, data []byte) {
stateU64 := (*[25]uint64)(unsafe.Pointer(state))
n := len(data) >> 3

View file

@ -267,13 +267,14 @@ func benchName(size int) string {
}
}
func BenchmarkFasterKeccak(b *testing.B) {
// BenchmarkKeccak256Sum tests Sum256 with local faster_keccak implementation.
func BenchmarkKeccak256Sum(b *testing.B) {
for _, size := range benchSizes {
data := make([]byte, size)
for i := range data {
data[i] = byte(i)
}
b.Run(benchName(size), func(b *testing.B) {
b.Run("FasterKeccak/"+benchName(size), func(b *testing.B) {
b.SetBytes(int64(size))
b.ReportAllocs()
for b.Loop() {
@ -283,13 +284,14 @@ func BenchmarkFasterKeccak(b *testing.B) {
}
}
func BenchmarkXCrypto(b *testing.B) {
// BenchmarkKeccak256Stdlib benchmarks Keccak-256 hashing with golang.org/x/crypto/sha3 as the baseline.
func BenchmarkKeccak256Stdlib(b *testing.B) {
for _, size := range benchSizes {
data := make([]byte, size)
for i := range data {
data[i] = byte(i)
}
b.Run(benchName(size), func(b *testing.B) {
b.Run("StdLib/"+benchName(size), func(b *testing.B) {
b.SetBytes(int64(size))
b.ReportAllocs()
h := sha3.NewLegacyKeccak256()
@ -302,13 +304,14 @@ func BenchmarkXCrypto(b *testing.B) {
}
}
func BenchmarkFasterKeccakHasher(b *testing.B) {
// BenchmarkKeccak256Hasher tests Hasher.Sum256() with local faster_keccak implementation.
func BenchmarkKeccak256Hasher(b *testing.B) {
for _, size := range benchSizes {
data := make([]byte, size)
for i := range data {
data[i] = byte(i)
}
b.Run(benchName(size), func(b *testing.B) {
b.Run("FasterKeccak/"+benchName(size), func(b *testing.B) {
b.SetBytes(int64(size))
b.ReportAllocs()
var h Hasher
@ -321,13 +324,35 @@ func BenchmarkFasterKeccakHasher(b *testing.B) {
}
}
// BenchmarkKeccakStreaming_Sha3 benchmarks the standard sha3 streaming hasher (Reset+Write+Read).
func BenchmarkKeccakStreaming_Sha3(b *testing.B) {
// BenchmarkKeccak256HasherStdlib benchmarks the streaming API
// (Reset+Write+Read) of the golang.org/x/crypto/sha3 legacy Keccak-256 hasher
// for every size in benchSizes.
func BenchmarkKeccak256HasherStdlib(b *testing.B) {
	for _, size := range benchSizes {
		// Deterministic input: byte i holds the low 8 bits of i.
		input := make([]byte, size)
		for i := range input {
			input[i] = byte(i)
		}
		name := "StdLib/" + benchName(size)
		b.Run(name, func(b *testing.B) {
			b.SetBytes(int64(size))
			b.ReportAllocs()
			var digest [32]byte
			hasher := sha3.NewLegacyKeccak256().(KeccakState)
			for b.Loop() {
				hasher.Reset()
				hasher.Write(input)
				hasher.Read(digest[:])
			}
		})
	}
}
// BenchmarkKeccakStreaming benchmarks the streaming hasher (Reset+Write+Read).
// Use with benchstat: go test -bench=BenchmarkKeccakStreaming -benchmem ./... | benchstat
func BenchmarkKeccakStreaming(b *testing.B) {
data := make([]byte, 32)
for i := range data {
data[i] = byte(i)
}
h := sha3.NewLegacyKeccak256().(KeccakState)
var h Hasher
var buf [32]byte
b.SetBytes(int64(len(data)))
b.ReportAllocs()

View file

@ -2,16 +2,20 @@
package keccak
import "unsafe"
import (
"unsafe"
"golang.org/x/sys/cpu"
)
// init enables the assembly sponge only when the CPU supports both extensions
// the generated permutation is built from: ANDNQ is a BMI1 instruction and
// RORXQ is a BMI2 instruction. Checking BMI2 alone would let a CPU without
// BMI1 reach an ANDNQ and fault with an illegal-instruction trap.
func init() { useASM = cpu.X86.HasBMI1 && cpu.X86.HasBMI2 }
//go:noescape
func keccakF1600(a *[200]byte)
func keccakF1600BMI2(a *[200]byte)
// Sum256 computes the Keccak-256 hash of data by delegating directly to
// sum256Sponge. Zero heap allocations.
func Sum256(data []byte) [32]byte { return sum256Sponge(data) }
// Hasher is a streaming Keccak-256 hasher. It consists solely of the embedded
// sponge, whose methods it therefore exposes. Designed for stack allocation.
type Hasher struct{ sponge }
// keccakF1600 applies the permutation to the 200-byte state a in place by
// forwarding to the BMI2 assembly routine. It performs no CPU feature check of
// its own.
// NOTE(review): callers presumably must only reach this when useASM is true —
// confirm no path calls it on CPUs lacking the required extensions.
func keccakF1600(a *[200]byte) {
	keccakF1600BMI2(a)
}
func xorAndPermute(state *[200]byte, buf *byte) {
xorIn(state, unsafe.Slice(buf, rate))

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,156 @@
//go:build ignore
// gen_keccakf_bmi2.go generates keccakf_amd64_bmi2.s — an amd64
// Keccak-f[1600] permutation built on RORXQ (BMI2) and ANDNQ (BMI1).
// Fully unrolled (all 24 rounds).
//
// Key optimizations:
// - D values kept in registers (R14, R15, BP, SI, DX), not on stack
// - State alternates between the original array (DI) and a 200-byte stack
// buffer, avoiding a second 200-byte copy
// - Frame is only 200 bytes (25 × 8 for temp state)
//
// Usage: go run gen_keccakf_bmi2.go
package main
import (
"fmt"
"os"
)
// rc holds the 24 Keccak-f[1600] round constants, indexed by round number.
// emitRound loads rc[round] into R13, and emitChi XORs R13 into output lane 0
// of that round.
var rc = [24]uint64{
	0x0000000000000001, 0x0000000000008082,
	0x800000000000808a, 0x8000000080008000,
	0x000000000000808b, 0x0000000080000001,
	0x8000000080008081, 0x8000000000008009,
	0x000000000000008a, 0x0000000000000088,
	0x0000000080008009, 0x000000008000000a,
	0x000000008000808b, 0x800000000000008b,
	0x8000000000008089, 0x8000000000008003,
	0x8000000000008002, 0x8000000000000080,
	0x000000000000800a, 0x800000008000000a,
	0x8000000080008081, 0x8000000000008080,
	0x0000000080000001, 0x8000000080008008,
}
// lane describes one input to a chi group: which state lane to load and how
// far to rotate it left after the theta D value has been XORed in.
type lane struct {
	idx int // state lane index (0-24)
	rot int // left-rotation amount in bits; 0 means no rotate is emitted
}
// Chi groups: each group reads 5 lanes (after theta+rho+pi)
// and produces 5 consecutive output lanes. Group g writes lanes g*5..g*5+4;
// each entry names the source lane and its left-rotation amount.
var groups = [5][5]lane{
	{{0, 0}, {6, 44}, {12, 43}, {18, 21}, {24, 14}},  // → lanes 0-4
	{{3, 28}, {9, 20}, {10, 3}, {16, 45}, {22, 61}},  // → lanes 5-9
	{{1, 1}, {7, 6}, {13, 25}, {19, 8}, {20, 18}},    // → lanes 10-14
	{{4, 27}, {5, 36}, {11, 10}, {17, 15}, {23, 56}}, // → lanes 15-19
	{{2, 62}, {8, 55}, {14, 39}, {15, 41}, {21, 2}},  // → lanes 20-24
}
// D-value registers, indexed by lane%5. emitRound fills them once per round
// and emitChi XORs dReg[lane%5] into every lane it loads.
var dReg = [5]string{"R14", "R15", "BP", "SI", "DX"}

// fsize is the frame size in bytes written into the generated TEXT directive:
// 25 lanes × 8 bytes for the temporary copy of the state on the stack.
const fsize = 200

// p emits one formatted line of output. It is nil until main installs it, so
// the emit helpers must not be called before then.
var p func(string, ...any)
// main writes keccakf_amd64_bmi2.s into the current directory: a file header,
// the TEXT directive for keccakF1600BMI2, and all 24 rounds fully unrolled.
//
// It panics if the file cannot be created, if any write fails, or if closing
// the file fails, so a truncated assembly file is never left behind silently.
func main() {
	f, err := os.Create("keccakf_amd64_bmi2.s")
	if err != nil {
		panic(err)
	}

	// Keep the first write failure and stop writing after it; the error is
	// reported once generation has finished.
	var writeErr error
	p = func(format string, args ...any) {
		if writeErr != nil {
			return
		}
		_, writeErr = fmt.Fprintf(f, format+"\n", args...)
	}

	p("// Code generated by gen_keccakf_bmi2.go. DO NOT EDIT.")
	p("")
	// The output name ends in _bmi2, not _amd64, so the go tool derives no
	// implicit GOARCH constraint from it; the architecture must be spelled
	// out or the file would be assembled on every platform.
	p("//go:build amd64 && !purego")
	p("")
	p("#include \"textflag.h\"")
	p("")

	// Function.
	p("// func keccakF1600BMI2(a *[200]byte)")
	p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-8", fsize)
	p("\tMOVQ a+0(FP), DI")

	for round := 0; round < 24; round++ {
		p("")
		p("\t// Round %d", round)
		// Even rounds read the caller's array and write the stack buffer,
		// odd rounds go the other way; after 24 rounds the result is back
		// in the array.
		srcArray := (round % 2) == 0
		emitRound(srcArray, round)
	}

	p("\tRET")

	closeErr := f.Close()
	if writeErr != nil {
		panic(writeErr)
	}
	if closeErr != nil {
		panic(closeErr)
	}
}
// emitRound emits the instructions for one round.
//
// srcArray selects the direction of the round: true reads the state from the
// array (DI) and writes the stack buffer (SP); false reads the stack buffer
// and writes the array. round selects the round constant.
func emitRound(srcArray bool, round int) {
	// R13 carries the round constant until emitChi folds it into lane 0.
	p("\tMOVQ $0x%016x, R13", rc[round])

	// Theta, step 1: XOR the five lanes of each column into one register.
	parity := [5]string{"AX", "BX", "CX", "DX", "SI"}
	for col, reg := range parity {
		p("\tMOVQ %s, %s", off(col, srcArray), reg)
		for idx := col + 5; idx < 25; idx += 5 {
			p("\tXORQ %s, %s", off(idx, srcArray), reg)
		}
	}

	// Theta, step 2: D[x] = C[(x+4)%5] ^ rol(C[(x+1)%5], 1). Rotating right
	// by 63 is the same as rotating left by 1. D[0..2] go straight into
	// their own registers.
	for x := 0; x < 3; x++ {
		d := dReg[x]
		p("\tRORXQ $63, %s, %s", parity[(x+1)%5], d)
		p("\tXORQ %s, %s", parity[(x+4)%5], d)
	}

	// D[3] and D[4] are destined for SI and DX, which still hold column
	// parities at this point, so both are built in R8/R9 first and moved
	// into place afterwards.
	p("\tRORXQ $63, SI, R8")
	p("\tXORQ CX, R8")
	p("\tRORXQ $63, AX, R9")
	p("\tXORQ DX, R9")
	p("\tMOVQ R8, SI")
	p("\tMOVQ R9, DX")

	// Five chi groups; only the first one applies the round constant.
	emitChi(0, srcArray, true)
	for g := 1; g < 5; g++ {
		emitChi(g, srcArray, false)
	}
}
// emitChi emits chi group g: it loads the group's five source lanes, applies
// the theta D value and the rotation to each, combines them with the chi
// step, and stores the five results to lanes g*5..g*5+4 of the destination.
//
// srcArray names the side being read; results go to the opposite side. When
// first is true the round constant in R13 is XORed into the group's first
// output lane.
func emitChi(g int, srcArray, first bool) {
	regs := [5]string{"R8", "R9", "R10", "R11", "R12"}

	// Load each lane, XOR in its column's D register, then rotate.
	for i, ln := range groups[g] {
		r := regs[i]
		p("\tMOVQ %s, %s", off(ln.idx, srcArray), r)
		p("\tXORQ %s, %s", dReg[ln.idx%5], r)
		if ln.rot == 0 {
			continue
		}
		// A left rotation by rot is emitted as a right rotation by 64-rot.
		p("\tRORXQ $%d, %s, %s", 64-ln.rot, r, r)
	}

	// Chi: out[j] = B[j] ^ (~B[(j+1)%5] & B[(j+2)%5]), using AX as scratch.
	dstArray := !srcArray
	for j := range regs {
		p("\tANDNQ %s, %s, AX", regs[(j+2)%5], regs[(j+1)%5])
		p("\tXORQ %s, AX", regs[j])
		if j == 0 && first {
			p("\tXORQ R13, AX")
		}
		p("\tMOVQ AX, %s", off(g*5+j, dstArray))
	}
}
// off formats the memory operand of state lane idx: a byte offset of idx*8
// from DI when array is true, or from SP when it is false.
func off(idx int, array bool) string {
	base := "SP"
	if array {
		base = "DI"
	}
	return fmt.Sprintf("%d(%s)", idx*8, base)
}