diff --git a/crypto/keccak/keccaf_arm64.s b/crypto/keccak/keccaf_arm64.s index 21af4540f9..a5f4617fc0 100644 --- a/crypto/keccak/keccaf_arm64.s +++ b/crypto/keccak/keccaf_arm64.s @@ -6,148 +6,18 @@ #include "textflag.h" -// func keccakF1600(a *[200]byte) -TEXT ·keccakF1600(SB), $200-8 +// func keccakF1600Sha3(a *[200]byte, buf *byte) +// When buf != nil, XORs rate bytes into state before permuting. +// When buf == nil, just permutes. +TEXT ·keccakF1600Sha3(SB), $200-16 MOVD a+0(FP), R0 + MOVD buf+8(FP), R3 MOVD $round_consts<>(SB), R1 MOVD $24, R2 // counter for loop - VLD1.P 16(R0), [V0.D1, V1.D1] - VLD1.P 16(R0), [V2.D1, V3.D1] - VLD1.P 16(R0), [V4.D1, V5.D1] - VLD1.P 16(R0), [V6.D1, V7.D1] - VLD1.P 16(R0), [V8.D1, V9.D1] - VLD1.P 16(R0), [V10.D1, V11.D1] - VLD1.P 16(R0), [V12.D1, V13.D1] - VLD1.P 16(R0), [V14.D1, V15.D1] - VLD1.P 16(R0), [V16.D1, V17.D1] - VLD1.P 16(R0), [V18.D1, V19.D1] - VLD1.P 16(R0), [V20.D1, V21.D1] - VLD1.P 16(R0), [V22.D1, V23.D1] - VLD1 (R0), [V24.D1] + CBZ R3, load_state - SUB $192, R0, R0 - -loop: - // theta - VEOR3 V20.B16, V15.B16, V10.B16, V25.B16 - VEOR3 V21.B16, V16.B16, V11.B16, V26.B16 - VEOR3 V22.B16, V17.B16, V12.B16, V27.B16 - VEOR3 V23.B16, V18.B16, V13.B16, V28.B16 - VEOR3 V24.B16, V19.B16, V14.B16, V29.B16 - VEOR3 V25.B16, V5.B16, V0.B16, V25.B16 - VEOR3 V26.B16, V6.B16, V1.B16, V26.B16 - VEOR3 V27.B16, V7.B16, V2.B16, V27.B16 - VEOR3 V28.B16, V8.B16, V3.B16, V28.B16 - VEOR3 V29.B16, V9.B16, V4.B16, V29.B16 - - VRAX1 V27.D2, V25.D2, V30.D2 - VRAX1 V28.D2, V26.D2, V31.D2 - VRAX1 V29.D2, V27.D2, V27.D2 - VRAX1 V25.D2, V28.D2, V28.D2 - VRAX1 V26.D2, V29.D2, V29.D2 - - // theta and rho and Pi - VEOR V29.B16, V0.B16, V0.B16 - - VXAR $63, V30.D2, V1.D2, V25.D2 - - VXAR $20, V30.D2, V6.D2, V1.D2 - VXAR $44, V28.D2, V9.D2, V6.D2 - VXAR $3, V31.D2, V22.D2, V9.D2 - VXAR $25, V28.D2, V14.D2, V22.D2 - VXAR $46, V29.D2, V20.D2, V14.D2 - - VXAR $2, V31.D2, V2.D2, V26.D2 - - VXAR $21, V31.D2, V12.D2, V2.D2 - VXAR $39, V27.D2, V13.D2, V12.D2 - VXAR $56, V28.D2, V19.D2, V13.D2 - VXAR $8, V27.D2, V23.D2, V19.D2 - VXAR $23, V29.D2, V15.D2, V23.D2 - - VXAR $37, V28.D2, V4.D2, V15.D2 - - VXAR $50, V28.D2, V24.D2, V28.D2 - VXAR $62, V30.D2, V21.D2, V24.D2 - VXAR $9, V27.D2, V8.D2, V8.D2 - VXAR $19, V30.D2, V16.D2, V4.D2 - VXAR $28, V29.D2, V5.D2, V16.D2 - - VXAR $36, V27.D2, V3.D2, V5.D2 - - VXAR $43, V27.D2, V18.D2, V27.D2 - VXAR $49, V31.D2, V17.D2, V3.D2 - VXAR $54, V30.D2, V11.D2, V30.D2 - VXAR $58, V31.D2, V7.D2, V31.D2 - VXAR $61, V29.D2, V10.D2, V29.D2 - - // chi and iota - VBCAX V8.B16, V22.B16, V26.B16, V20.B16 - VBCAX V22.B16, V23.B16, V8.B16, V21.B16 - VBCAX V23.B16, V24.B16, V22.B16, V22.B16 - VBCAX V24.B16, V26.B16, V23.B16, V23.B16 - VBCAX V26.B16, V8.B16, V24.B16, V24.B16 - - VLD1R.P 8(R1), [V26.D2] - - VBCAX V3.B16, V19.B16, V30.B16, V17.B16 - VBCAX V19.B16, V15.B16, V3.B16, V18.B16 - VBCAX V15.B16, V16.B16, V19.B16, V19.B16 - VBCAX V16.B16, V30.B16, V15.B16, V15.B16 - VBCAX V30.B16, V3.B16, V16.B16, V16.B16 - - VBCAX V31.B16, V12.B16, V25.B16, V10.B16 - VBCAX V12.B16, V13.B16, V31.B16, V11.B16 - VBCAX V13.B16, V14.B16, V12.B16, V12.B16 - VBCAX V14.B16, V25.B16, V13.B16, V13.B16 - VBCAX V25.B16, V31.B16, V14.B16, V14.B16 - - VBCAX V4.B16, V9.B16, V29.B16, V7.B16 - VBCAX V9.B16, V5.B16, V4.B16, V8.B16 - VBCAX V5.B16, V6.B16, V9.B16, V9.B16 - VBCAX V6.B16, V29.B16, V5.B16, V5.B16 - VBCAX V29.B16, V4.B16, V6.B16, V6.B16 - - VBCAX V28.B16, V0.B16, V27.B16, V3.B16 - VBCAX V0.B16, V1.B16, V28.B16, V4.B16 - - VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part) - - VBCAX V2.B16, V27.B16, V1.B16, V1.B16 - VBCAX V27.B16, V28.B16, V2.B16, V2.B16 - - VEOR V26.B16, V0.B16, V0.B16 // iota - - SUB $1, R2, R2 - CBNZ R2, loop - - VST1.P [V0.D1, V1.D1], 16(R0) - VST1.P [V2.D1, V3.D1], 16(R0) - VST1.P [V4.D1, V5.D1], 16(R0) - VST1.P [V6.D1, V7.D1], 16(R0) - VST1.P [V8.D1, V9.D1], 16(R0) - VST1.P [V10.D1, V11.D1], 16(R0) - VST1.P [V12.D1, V13.D1], 16(R0) - VST1.P [V14.D1, V15.D1], 16(R0) - VST1.P [V16.D1, V17.D1], 16(R0) - VST1.P [V18.D1, V19.D1], 16(R0) - VST1.P [V20.D1, V21.D1], 16(R0) - VST1.P [V22.D1, V23.D1], 16(R0) - VST1 [V24.D1], (R0) - - RET - -// func xorAndPermute(state *[200]byte, buf *byte) -// Loads state, XORs a full rate (136 bytes = 17 lanes) of data, then runs keccakF1600. -// Eliminates one state store+load cycle per block vs separate xorIn + keccakF1600. -TEXT ·xorAndPermute(SB), $200-16 - MOVD state+0(FP), R0 - MOVD buf+8(FP), R3 - MOVD $round_consts<>(SB), R1 - MOVD $24, R2 - - // Load state and XOR data for lanes 0-15 (8 pairs × 16 bytes = 128 bytes) + // XOR path: load state and XOR with buf (17 lanes = 136 bytes) VLD1.P 16(R0), [V0.D1, V1.D1] VLD1.P 16(R3), [V25.D1, V26.D1] VEOR V25.B16, V0.B16, V0.B16 @@ -188,7 +58,7 @@ TEXT ·xorAndPermute(SB), $200-16 VEOR V25.B16, V14.B16, V14.B16 VEOR V26.B16, V15.B16, V15.B16 - // Lane 16-17: XOR only lane 16 (last data lane, 8 bytes at data offset 128) + // Lane 16: last data lane (8 bytes at buf offset 128) VLD1.P 16(R0), [V16.D1, V17.D1] VLD1 (R3), [V25.D1] VEOR V25.B16, V16.B16, V16.B16 @@ -200,8 +70,26 @@ TEXT ·xorAndPermute(SB), $200-16 VLD1 (R0), [V24.D1] SUB $192, R0, R0 + B rounds -loop_xp: +load_state: + VLD1.P 16(R0), [V0.D1, V1.D1] + VLD1.P 16(R0), [V2.D1, V3.D1] + VLD1.P 16(R0), [V4.D1, V5.D1] + VLD1.P 16(R0), [V6.D1, V7.D1] + VLD1.P 16(R0), [V8.D1, V9.D1] + VLD1.P 16(R0), [V10.D1, V11.D1] + VLD1.P 16(R0), [V12.D1, V13.D1] + VLD1.P 16(R0), [V14.D1, V15.D1] + VLD1.P 16(R0), [V16.D1, V17.D1] + VLD1.P 16(R0), [V18.D1, V19.D1] + VLD1.P 16(R0), [V20.D1, V21.D1] + VLD1.P 16(R0), [V22.D1, V23.D1] + VLD1 (R0), [V24.D1] + + SUB $192, R0, R0 + +rounds: // theta VEOR3 V20.B16, V15.B16, V10.B16, V25.B16 VEOR3 V21.B16, V16.B16, V11.B16, V26.B16 @@ -293,7 +181,7 @@ loop_xp: VEOR V26.B16, V0.B16, V0.B16 // iota SUB $1, R2, R2 - CBNZ R2, loop_xp + CBNZ R2, rounds VST1.P [V0.D1, V1.D1], 16(R0) VST1.P [V2.D1, V3.D1], 16(R0) diff --git a/crypto/keccak/keccak_arm64.go b/crypto/keccak/keccak_arm64.go index a7e2800448..3b9c879fe2 100644 --- a/crypto/keccak/keccak_arm64.go +++ b/crypto/keccak/keccak_arm64.go @@ -15,8 +15,16 @@ func init() { useASM = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3 } +// keccakF1600Sha3 permutes state. When buf != nil, it first XORs rate bytes +// of buf into state, saving one full memory pass. +// //go:noescape -func keccakF1600(a *[200]byte) +func keccakF1600Sha3(a *[200]byte, buf *byte) -//go:noescape -func xorAndPermute(state *[200]byte, buf *byte) +func keccakF1600(a *[200]byte) { + keccakF1600Sha3(a, nil) +} + +func xorAndPermute(state *[200]byte, buf *byte) { + keccakF1600Sha3(state, buf) +} diff --git a/crypto/keccak/keccak_asm.go b/crypto/keccak/keccak_asm.go index 3a12f7b775..37ce284f02 100644 --- a/crypto/keccak/keccak_asm.go +++ b/crypto/keccak/keccak_asm.go @@ -3,7 +3,7 @@ package keccak import ( - "unsafe" + "encoding/binary" "golang.org/x/crypto/sha3" ) @@ -59,7 +59,11 @@ func (s *sponge) Write(p []byte) (int, error) { // Sum256 finalizes and returns the 32-byte Keccak-256 digest. // Does not modify the sponge state. +// Panics if called after Read. func (s *sponge) Sum256() [32]byte { + if s.squeezing { + panic("keccak: Sum after Read") + } state := s.state xorIn(&state, s.buf[:s.absorbed]) state[s.absorbed] ^= 0x01 @@ -217,14 +221,13 @@ func (h *Hasher) Read(out []byte) (int, error) { return h.sponge.Read(out) } +// xorIn XORs data into the first len(data) bytes of state using uint64 loads. func xorIn(state *[200]byte, data []byte) { - stateU64 := (*[25]uint64)(unsafe.Pointer(state)) - n := len(data) >> 3 - p := unsafe.Pointer(unsafe.SliceData(data)) - for i := range n { - stateU64[i] ^= *(*uint64)(unsafe.Add(p, uintptr(i)<<3)) + for i := 0; i+8 <= len(data); i += 8 { + v := binary.LittleEndian.Uint64(state[i:]) ^ binary.LittleEndian.Uint64(data[i:]) + binary.LittleEndian.PutUint64(state[i:], v) } - for i := n << 3; i < len(data); i++ { + for i := len(data) &^ 7; i < len(data); i++ { state[i] ^= data[i] } } diff --git a/crypto/keccak/keccak_test.go b/crypto/keccak/keccak_test.go index 65ea21fcb3..4a9875585c 100644 --- a/crypto/keccak/keccak_test.go +++ b/crypto/keccak/keccak_test.go @@ -246,25 +246,14 @@ func FuzzSum256(f *testing.F) { }) } -func BenchmarkSum256_500K(b *testing.B) { - data := make([]byte, 500*1024) - b.SetBytes(int64(len(data))) - b.ReportAllocs() - for b.Loop() { - Sum256(data) - } -} - // Comparison benchmarks: faster_keccak vs golang.org/x/crypto/sha3. var benchSizes = []int{32, 128, 256, 1024, 4096, 500 * 1024} func benchName(size int) string { - switch { - case size >= 1024: + if size >= 1024 { return fmt.Sprintf("%dK", size/1024) - default: - return fmt.Sprintf("%dB", size) } + return fmt.Sprintf("%dB", size) } // BenchmarkKeccak256Sum tests Sum256 with local faster_keccak implementation. diff --git a/crypto/keccak/keccakf_amd64.go b/crypto/keccak/keccakf_amd64.go index e9a0610cc4..b4565dd58e 100644 --- a/crypto/keccak/keccakf_amd64.go +++ b/crypto/keccak/keccakf_amd64.go @@ -2,22 +2,20 @@ package keccak -import ( - "unsafe" +import "golang.org/x/sys/cpu" - "golang.org/x/sys/cpu" -) - -func init() { useASM = cpu.X86.HasBMI2 && cpu.X86.HasBMI1 } +func init() { useASM = cpu.X86.HasBMI1 && cpu.X86.HasBMI2 } +// keccakF1600BMI2 permutes state. When buf != nil, it first XORs rate bytes +// of buf into state, saving one full memory pass. +// //go:noescape -func keccakF1600BMI2(a *[200]byte) +func keccakF1600BMI2(a *[200]byte, buf *byte) func keccakF1600(a *[200]byte) { - keccakF1600BMI2(a) + keccakF1600BMI2(a, nil) } func xorAndPermute(state *[200]byte, buf *byte) { - xorIn(state, unsafe.Slice(buf, rate)) - keccakF1600(state) + keccakF1600BMI2(state, buf) } diff --git a/crypto/keccak/keccakf_amd64_bmi2.s b/crypto/keccak/keccakf_amd64_bmi2.s index fa3e1c33ae..f38381bfff 100644 --- a/crypto/keccak/keccakf_amd64_bmi2.s +++ b/crypto/keccak/keccakf_amd64_bmi2.s @@ -4,9 +4,50 @@ #include "textflag.h" -// func keccakF1600BMI2(a *[200]byte) -TEXT ·keccakF1600BMI2(SB), NOSPLIT, $200-8 +// func keccakF1600BMI2(a *[200]byte, buf *byte) +TEXT ·keccakF1600BMI2(SB), NOSPLIT, $200-16 MOVQ a+0(FP), DI + MOVQ buf+8(FP), BX + TESTQ BX, BX + JZ rounds + + // XOR 17 lanes (136 bytes) of buf into state. + MOVQ 0(BX), AX + XORQ AX, 0(DI) + MOVQ 8(BX), AX + XORQ AX, 8(DI) + MOVQ 16(BX), AX + XORQ AX, 16(DI) + MOVQ 24(BX), AX + XORQ AX, 24(DI) + MOVQ 32(BX), AX + XORQ AX, 32(DI) + MOVQ 40(BX), AX + XORQ AX, 40(DI) + MOVQ 48(BX), AX + XORQ AX, 48(DI) + MOVQ 56(BX), AX + XORQ AX, 56(DI) + MOVQ 64(BX), AX + XORQ AX, 64(DI) + MOVQ 72(BX), AX + XORQ AX, 72(DI) + MOVQ 80(BX), AX + XORQ AX, 80(DI) + MOVQ 88(BX), AX + XORQ AX, 88(DI) + MOVQ 96(BX), AX + XORQ AX, 96(DI) + MOVQ 104(BX), AX + XORQ AX, 104(DI) + MOVQ 112(BX), AX + XORQ AX, 112(DI) + MOVQ 120(BX), AX + XORQ AX, 120(DI) + MOVQ 128(BX), AX + XORQ AX, 128(DI) + +rounds: // Round 0 MOVQ $0x0000000000000001, R13 diff --git a/crypto/keccak/‎gen_keccakf_bmi2.go b/crypto/keccak/‎gen_keccakf_bmi2.go index 457bc7bfcb..1773fa11dd 100644 --- a/crypto/keccak/‎gen_keccakf_bmi2.go +++ b/crypto/keccak/‎gen_keccakf_bmi2.go @@ -9,6 +9,8 @@ // - State alternates between the original array (DI) and a 200-byte stack // buffer, avoiding a second 200-byte copy // - Frame is only 200 bytes (25 × 8 for temp state) +// - Optional XOR-and-permute: when buf != nil, XORs rate bytes into state +// before permuting, eliminating one full memory pass // // Usage: go run gen_keccakf_bmi2.go @@ -52,7 +54,10 @@ var groups = [5][5]lane{ // D-value registers, indexed by lane%5. var dReg = [5]string{"R14", "R15", "BP", "SI", "DX"} -const fsize = 200 +const ( + fsize = 200 + rateLanes = 17 // rate / 8 = 136 / 8 = 17 lanes +) var p func(string, ...any) @@ -71,10 +76,23 @@ func main() { p("#include \"textflag.h\"") p("") - // Function. - p("// func keccakF1600BMI2(a *[200]byte)") - p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-8", fsize) + // Single function: keccakF1600BMI2(a *[200]byte, buf *byte) + // When buf != nil, XORs rate bytes into state before permuting. + // When buf == nil, just permutes. + p("// func keccakF1600BMI2(a *[200]byte, buf *byte)") + p("TEXT ·keccakF1600BMI2(SB), NOSPLIT, $%d-16", fsize) p("\tMOVQ a+0(FP), DI") + p("\tMOVQ buf+8(FP), BX") + p("\tTESTQ BX, BX") + p("\tJZ rounds") + p("") + p("\t// XOR %d lanes (%d bytes) of buf into state.", rateLanes, rateLanes*8) + for i := 0; i < rateLanes; i++ { + p("\tMOVQ %d(BX), AX", i*8) + p("\tXORQ AX, %d(DI)", i*8) + } + p("") + p("rounds:") for round := 0; round < 24; round++ { p("") @@ -103,18 +121,19 @@ func emitRound(srcArray bool, round int) { } // D values: D[x] = C[(x+4)%5] ^ rol(C[(x+1)%5], 1). + // D[0..2] go directly into R14, R15, BP (no conflicts). for _, x := range []int{0, 1, 2} { p("\tRORXQ $63, %s, %s", colR[(x+1)%5], dReg[x]) p("\tXORQ %s, %s", colR[(x+4)%5], dReg[x]) } - // Do = CX ^ rol(SI, 1) → R8 temp, then move to SI + // D[3] and D[4] target SI and DX, which still hold column parities + // C[4] and C[3] needed as inputs, so compute via temps first. p("\tRORXQ $63, SI, R8") p("\tXORQ CX, R8") - // Du = DX ^ rol(AX, 1) → R9 temp, then move to DX p("\tRORXQ $63, AX, R9") p("\tXORQ DX, R9") - p("\tMOVQ R8, SI") // SI = Do - p("\tMOVQ R9, DX") // DX = Du + p("\tMOVQ R8, SI") // SI = D[3] + p("\tMOVQ R9, DX") // DX = D[4] // Five chi groups. for g := 0; g < 5; g++ {