crypto/keccak: fuse xor into asm keccak permutations

This commit is contained in:
Sahil Sojitra 2026-04-08 09:19:25 +05:30
parent a8c906d6bd
commit ee36b77785
7 changed files with 129 additions and 183 deletions

View file

@ -6,148 +6,18 @@
#include "textflag.h"
// func keccakF1600(a *[200]byte)
TEXT ·keccakF1600(SB), $200-8
// func keccakF1600Sha3(a *[200]byte, buf *byte)
// When buf != nil, XORs rate bytes into state before permuting.
// When buf == nil, just permutes.
TEXT ·keccakF1600Sha3(SB), $200-16
MOVD a+0(FP), R0
MOVD buf+8(FP), R3
MOVD $round_consts<>(SB), R1
MOVD $24, R2 // counter for loop
VLD1.P 16(R0), [V0.D1, V1.D1]
VLD1.P 16(R0), [V2.D1, V3.D1]
VLD1.P 16(R0), [V4.D1, V5.D1]
VLD1.P 16(R0), [V6.D1, V7.D1]
VLD1.P 16(R0), [V8.D1, V9.D1]
VLD1.P 16(R0), [V10.D1, V11.D1]
VLD1.P 16(R0), [V12.D1, V13.D1]
VLD1.P 16(R0), [V14.D1, V15.D1]
VLD1.P 16(R0), [V16.D1, V17.D1]
VLD1.P 16(R0), [V18.D1, V19.D1]
VLD1.P 16(R0), [V20.D1, V21.D1]
VLD1.P 16(R0), [V22.D1, V23.D1]
VLD1 (R0), [V24.D1]
CBZ R3, load_state
SUB $192, R0, R0
loop:
// theta
VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
VEOR3 V22.B16, V17.B16, V12.B16, V27.B16
VEOR3 V23.B16, V18.B16, V13.B16, V28.B16
VEOR3 V24.B16, V19.B16, V14.B16, V29.B16
VEOR3 V25.B16, V5.B16, V0.B16, V25.B16
VEOR3 V26.B16, V6.B16, V1.B16, V26.B16
VEOR3 V27.B16, V7.B16, V2.B16, V27.B16
VEOR3 V28.B16, V8.B16, V3.B16, V28.B16
VEOR3 V29.B16, V9.B16, V4.B16, V29.B16
VRAX1 V27.D2, V25.D2, V30.D2
VRAX1 V28.D2, V26.D2, V31.D2
VRAX1 V29.D2, V27.D2, V27.D2
VRAX1 V25.D2, V28.D2, V28.D2
VRAX1 V26.D2, V29.D2, V29.D2
// theta and rho and Pi
VEOR V29.B16, V0.B16, V0.B16
VXAR $63, V30.D2, V1.D2, V25.D2
VXAR $20, V30.D2, V6.D2, V1.D2
VXAR $44, V28.D2, V9.D2, V6.D2
VXAR $3, V31.D2, V22.D2, V9.D2
VXAR $25, V28.D2, V14.D2, V22.D2
VXAR $46, V29.D2, V20.D2, V14.D2
VXAR $2, V31.D2, V2.D2, V26.D2
VXAR $21, V31.D2, V12.D2, V2.D2
VXAR $39, V27.D2, V13.D2, V12.D2
VXAR $56, V28.D2, V19.D2, V13.D2
VXAR $8, V27.D2, V23.D2, V19.D2
VXAR $23, V29.D2, V15.D2, V23.D2
VXAR $37, V28.D2, V4.D2, V15.D2
VXAR $50, V28.D2, V24.D2, V28.D2
VXAR $62, V30.D2, V21.D2, V24.D2
VXAR $9, V27.D2, V8.D2, V8.D2
VXAR $19, V30.D2, V16.D2, V4.D2
VXAR $28, V29.D2, V5.D2, V16.D2
VXAR $36, V27.D2, V3.D2, V5.D2
VXAR $43, V27.D2, V18.D2, V27.D2
VXAR $49, V31.D2, V17.D2, V3.D2
VXAR $54, V30.D2, V11.D2, V30.D2
VXAR $58, V31.D2, V7.D2, V31.D2
VXAR $61, V29.D2, V10.D2, V29.D2
// chi and iota
VBCAX V8.B16, V22.B16, V26.B16, V20.B16
VBCAX V22.B16, V23.B16, V8.B16, V21.B16
VBCAX V23.B16, V24.B16, V22.B16, V22.B16
VBCAX V24.B16, V26.B16, V23.B16, V23.B16
VBCAX V26.B16, V8.B16, V24.B16, V24.B16
VLD1R.P 8(R1), [V26.D2]
VBCAX V3.B16, V19.B16, V30.B16, V17.B16
VBCAX V19.B16, V15.B16, V3.B16, V18.B16
VBCAX V15.B16, V16.B16, V19.B16, V19.B16
VBCAX V16.B16, V30.B16, V15.B16, V15.B16
VBCAX V30.B16, V3.B16, V16.B16, V16.B16
VBCAX V31.B16, V12.B16, V25.B16, V10.B16
VBCAX V12.B16, V13.B16, V31.B16, V11.B16
VBCAX V13.B16, V14.B16, V12.B16, V12.B16
VBCAX V14.B16, V25.B16, V13.B16, V13.B16
VBCAX V25.B16, V31.B16, V14.B16, V14.B16
VBCAX V4.B16, V9.B16, V29.B16, V7.B16
VBCAX V9.B16, V5.B16, V4.B16, V8.B16
VBCAX V5.B16, V6.B16, V9.B16, V9.B16
VBCAX V6.B16, V29.B16, V5.B16, V5.B16
VBCAX V29.B16, V4.B16, V6.B16, V6.B16
VBCAX V28.B16, V0.B16, V27.B16, V3.B16
VBCAX V0.B16, V1.B16, V28.B16, V4.B16
VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)
VBCAX V2.B16, V27.B16, V1.B16, V1.B16
VBCAX V27.B16, V28.B16, V2.B16, V2.B16
VEOR V26.B16, V0.B16, V0.B16 // iota
SUB $1, R2, R2
CBNZ R2, loop
VST1.P [V0.D1, V1.D1], 16(R0)
VST1.P [V2.D1, V3.D1], 16(R0)
VST1.P [V4.D1, V5.D1], 16(R0)
VST1.P [V6.D1, V7.D1], 16(R0)
VST1.P [V8.D1, V9.D1], 16(R0)
VST1.P [V10.D1, V11.D1], 16(R0)
VST1.P [V12.D1, V13.D1], 16(R0)
VST1.P [V14.D1, V15.D1], 16(R0)
VST1.P [V16.D1, V17.D1], 16(R0)
VST1.P [V18.D1, V19.D1], 16(R0)
VST1.P [V20.D1, V21.D1], 16(R0)
VST1.P [V22.D1, V23.D1], 16(R0)
VST1 [V24.D1], (R0)
RET
// func xorAndPermute(state *[200]byte, buf *byte)
// Loads state, XORs a full rate (136 bytes = 17 lanes) of data, then runs keccakF1600.
// Eliminates one state store+load cycle per block vs separate xorIn + keccakF1600.
TEXT ·xorAndPermute(SB), $200-16
MOVD state+0(FP), R0
MOVD buf+8(FP), R3
MOVD $round_consts<>(SB), R1
MOVD $24, R2
// Load state and XOR data for lanes 0-15 (8 pairs × 16 bytes = 128 bytes)
// XOR path: load state and XOR with buf (17 lanes = 136 bytes)
VLD1.P 16(R0), [V0.D1, V1.D1]
VLD1.P 16(R3), [V25.D1, V26.D1]
VEOR V25.B16, V0.B16, V0.B16
@ -188,7 +58,7 @@ TEXT ·xorAndPermute(SB), $200-16
VEOR V25.B16, V14.B16, V14.B16
VEOR V26.B16, V15.B16, V15.B16
// Lane 16-17: XOR only lane 16 (last data lane, 8 bytes at data offset 128)
// Lane 16: last data lane (8 bytes at buf offset 128)
VLD1.P 16(R0), [V16.D1, V17.D1]
VLD1 (R3), [V25.D1]
VEOR V25.B16, V16.B16, V16.B16
@ -200,8 +70,26 @@ TEXT ·xorAndPermute(SB), $200-16
VLD1 (R0), [V24.D1]
SUB $192, R0, R0
B rounds
loop_xp:
load_state:
VLD1.P 16(R0), [V0.D1, V1.D1]
VLD1.P 16(R0), [V2.D1, V3.D1]
VLD1.P 16(R0), [V4.D1, V5.D1]
VLD1.P 16(R0), [V6.D1, V7.D1]
VLD1.P 16(R0), [V8.D1, V9.D1]
VLD1.P 16(R0), [V10.D1, V11.D1]
VLD1.P 16(R0), [V12.D1, V13.D1]
VLD1.P 16(R0), [V14.D1, V15.D1]
VLD1.P 16(R0), [V16.D1, V17.D1]
VLD1.P 16(R0), [V18.D1, V19.D1]
VLD1.P 16(R0), [V20.D1, V21.D1]
VLD1.P 16(R0), [V22.D1, V23.D1]
VLD1 (R0), [V24.D1]
SUB $192, R0, R0
rounds:
// theta
VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
@ -293,7 +181,7 @@ loop_xp:
VEOR V26.B16, V0.B16, V0.B16 // iota
SUB $1, R2, R2
CBNZ R2, loop_xp
CBNZ R2, rounds
VST1.P [V0.D1, V1.D1], 16(R0)
VST1.P [V2.D1, V3.D1], 16(R0)

View file

@ -15,8 +15,16 @@ func init() {
useASM = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3
}
// keccakF1600Sha3 permutes state. When buf != nil, it first XORs rate bytes
// of buf into state, saving one full memory pass.
//
//go:noescape
func keccakF1600(a *[200]byte)
func keccakF1600Sha3(a *[200]byte, buf *byte)
//go:noescape
func xorAndPermute(state *[200]byte, buf *byte)
// keccakF1600 permutes a in place without absorbing any input. It forwards to
// keccakF1600Sha3 with a nil buf, which selects the permute-only path.
func keccakF1600(a *[200]byte) {
keccakF1600Sha3(a, nil)
}
// xorAndPermute absorbs one block and permutes. It forwards state and buf to
// keccakF1600Sha3, which XORs buf into state before permuting when buf is
// non-nil. The assembly XOR path reads 17 lanes (136 bytes) from buf, so buf
// must point to at least that many readable bytes.
// NOTE(review): a nil buf falls through to a plain permutation with no XOR —
// confirm no caller relies on passing nil here.
func xorAndPermute(state *[200]byte, buf *byte) {
keccakF1600Sha3(state, buf)
}

View file

@ -3,7 +3,7 @@
package keccak
import (
"unsafe"
"encoding/binary"
"golang.org/x/crypto/sha3"
)
@ -59,7 +59,11 @@ func (s *sponge) Write(p []byte) (int, error) {
// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
// Does not modify the sponge state.
// Panics if called after Read.
func (s *sponge) Sum256() [32]byte {
if s.squeezing {
panic("keccak: Sum after Read")
}
state := s.state
xorIn(&state, s.buf[:s.absorbed])
state[s.absorbed] ^= 0x01
@ -217,14 +221,13 @@ func (h *Hasher) Read(out []byte) (int, error) {
return h.sponge.Read(out)
}
// xorIn XORs data into the first len(data) bytes of state using uint64 loads.
func xorIn(state *[200]byte, data []byte) {
stateU64 := (*[25]uint64)(unsafe.Pointer(state))
n := len(data) >> 3
p := unsafe.Pointer(unsafe.SliceData(data))
for i := range n {
stateU64[i] ^= *(*uint64)(unsafe.Add(p, uintptr(i)<<3))
for i := 0; i+8 <= len(data); i += 8 {
v := binary.LittleEndian.Uint64(state[i:]) ^ binary.LittleEndian.Uint64(data[i:])
binary.LittleEndian.PutUint64(state[i:], v)
}
for i := n << 3; i < len(data); i++ {
for i := len(data) &^ 7; i < len(data); i++ {
state[i] ^= data[i]
}
}

View file

@ -246,25 +246,14 @@ func FuzzSum256(f *testing.F) {
})
}
func BenchmarkSum256_500K(b *testing.B) {
data := make([]byte, 500*1024)
b.SetBytes(int64(len(data)))
b.ReportAllocs()
for b.Loop() {
Sum256(data)
}
}
// Comparison benchmarks: faster_keccak vs golang.org/x/crypto/sha3.
var benchSizes = []int{32, 128, 256, 1024, 4096, 500 * 1024}
func benchName(size int) string {
switch {
case size >= 1024:
if size >= 1024 {
return fmt.Sprintf("%dK", size/1024)
default:
return fmt.Sprintf("%dB", size)
}
return fmt.Sprintf("%dB", size)
}
// BenchmarkKeccak256Sum benchmarks Sum256 with the local faster_keccak implementation.

View file

@ -2,22 +2,20 @@
package keccak
import (
"unsafe"
import "golang.org/x/sys/cpu"
"golang.org/x/sys/cpu"
)
func init() { useASM = cpu.X86.HasBMI2 && cpu.X86.HasBMI1 }
func init() { useASM = cpu.X86.HasBMI1 && cpu.X86.HasBMI2 }
// keccakF1600BMI2 permutes state. When buf != nil, it first XORs rate bytes
// of buf into state, saving one full memory pass.
//
//go:noescape
func keccakF1600BMI2(a *[200]byte)
func keccakF1600BMI2(a *[200]byte, buf *byte)
func keccakF1600(a *[200]byte) {
keccakF1600BMI2(a)
keccakF1600BMI2(a, nil)
}
func xorAndPermute(state *[200]byte, buf *byte) {
xorIn(state, unsafe.Slice(buf, rate))
keccakF1600(state)
keccakF1600BMI2(state, buf)
}

View file

@ -4,9 +4,50 @@
#include "textflag.h"
// func keccakF1600BMI2(a *[200]byte)
TEXT ·keccakF1600BMI2(SB), NOSPLIT, $200-8
// func keccakF1600BMI2(a *[200]byte, buf *byte)
TEXT ·keccakF1600BMI2(SB), NOSPLIT, $200-16
MOVQ a+0(FP), DI
MOVQ buf+8(FP), BX
TESTQ BX, BX
JZ rounds
// XOR 17 lanes (136 bytes) of buf into state.
MOVQ 0(BX), AX
XORQ AX, 0(DI)
MOVQ 8(BX), AX
XORQ AX, 8(DI)
MOVQ 16(BX), AX
XORQ AX, 16(DI)
MOVQ 24(BX), AX
XORQ AX, 24(DI)
MOVQ 32(BX), AX
XORQ AX, 32(DI)
MOVQ 40(BX), AX
XORQ AX, 40(DI)
MOVQ 48(BX), AX
XORQ AX, 48(DI)
MOVQ 56(BX), AX
XORQ AX, 56(DI)
MOVQ 64(BX), AX
XORQ AX, 64(DI)
MOVQ 72(BX), AX
XORQ AX, 72(DI)
MOVQ 80(BX), AX
XORQ AX, 80(DI)
MOVQ 88(BX), AX
XORQ AX, 88(DI)
MOVQ 96(BX), AX
XORQ AX, 96(DI)
MOVQ 104(BX), AX
XORQ AX, 104(DI)
MOVQ 112(BX), AX
XORQ AX, 112(DI)
MOVQ 120(BX), AX
XORQ AX, 120(DI)
MOVQ 128(BX), AX
XORQ AX, 128(DI)
rounds:
// Round 0
MOVQ $0x0000000000000001, R13

View file

@ -9,6 +9,8 @@
// - State alternates between the original array (DI) and a 200-byte stack
// buffer, avoiding a second 200-byte copy
// - Frame is only 200 bytes (25 × 8 for temp state)
// - Optional XOR-and-permute: when buf != nil, XORs rate bytes into state
// before permuting, eliminating one full memory pass
//
// Usage: go run gen_keccakf_bmi2.go
@ -52,7 +54,10 @@ var groups = [5][5]lane{
// D-value registers, indexed by lane%5.
var dReg = [5]string{"R14", "R15", "BP", "SI", "DX"}
const fsize = 200
const (
fsize = 200
rateLanes = 17 // rate / 8 = 136 / 8 = 17 lanes
)
var p func(string, ...any)
@ -71,10 +76,23 @@ func main() {
p("#include \"textflag.h\"")
p("")
// Function.
p("// func keccakF1600BMI2(a *[200]byte)")
p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-8", fsize)
// Single function: keccakF1600BMI2(a *[200]byte, buf *byte)
// When buf != nil, XORs rate bytes into state before permuting.
// When buf == nil, just permutes.
p("// func keccakF1600BMI2(a *[200]byte, buf *byte)")
p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-16", fsize)
p("\tMOVQ a+0(FP), DI")
p("\tMOVQ buf+8(FP), BX")
p("\tTESTQ BX, BX")
p("\tJZ rounds")
p("")
p("\t// XOR %d lanes (%d bytes) of buf into state.", rateLanes, rateLanes*8)
for i := 0; i < rateLanes; i++ {
p("\tMOVQ %d(BX), AX", i*8)
p("\tXORQ AX, %d(DI)", i*8)
}
p("")
p("rounds:")
for round := 0; round < 24; round++ {
p("")
@ -103,18 +121,19 @@ func emitRound(srcArray bool, round int) {
}
// D values: D[x] = C[(x+4)%5] ^ rol(C[(x+1)%5], 1).
// D[0..2] go directly into R14, R15, BP (no conflicts).
for _, x := range []int{0, 1, 2} {
p("\tRORXQ $63, %s, %s", colR[(x+1)%5], dReg[x])
p("\tXORQ %s, %s", colR[(x+4)%5], dReg[x])
}
// Do = CX ^ rol(SI, 1) → R8 temp, then move to SI
// D[3] and D[4] target SI and DX, which still hold column parities
// C[4] and C[3] needed as inputs, so compute via temps first.
p("\tRORXQ $63, SI, R8")
p("\tXORQ CX, R8")
// Du = DX ^ rol(AX, 1) → R9 temp, then move to DX
p("\tRORXQ $63, AX, R9")
p("\tXORQ DX, R9")
p("\tMOVQ R8, SI") // SI = Do
p("\tMOVQ R9, DX") // DX = Du
p("\tMOVQ R8, SI") // SI = D[3]
p("\tMOVQ R9, DX") // DX = D[4]
// Five chi groups.
for g := 0; g < 5; g++ {