mirror of
https://github.com/ethereum/go-ethereum.git
synced 2026-05-24 08:49:29 +00:00
226 lines
6.5 KiB
ArmAsm
226 lines
6.5 KiB
ArmAsm
// Copyright 2022 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
//go:build !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// func keccakF1600Sha3(a *[200]byte, buf *byte)
//
// Applies the 24-round permutation below, in place, to the 25 64-bit lanes
// stored at a.
//
//   - buf == nil: the state is permuted as it is.
//   - buf != nil: lanes 0-16 of the state (136 bytes) are first XORed with
//     136 bytes read from buf, then the state is permuted. Lanes 17-24 are
//     not touched by the XOR. buf must therefore point at 136 readable bytes.
//
// Register roles:
//   R0       state pointer; post-incremented while loading, rewound by 192
//            before the rounds, post-incremented again while storing
//   R1       cursor into round_consts<>, advanced by 8 bytes per round
//   R2       rounds remaining (24 down to 0)
//   R3       buf pointer (used on the XOR path only)
//   V0-V24   state lanes 0-24, one lane in the low 64 bits of each register
//   V25-V31  temporaries
//
// VEOR3, VRAX1, VXAR and VBCAX belong to the Armv8 SHA3 extension; the caller
// is presumably responsible for checking CPU support - verify at the call site.
//
// NOTE(review): a 200-byte frame is declared, but no instruction in this
// function addresses it. Left unchanged here.
TEXT ·keccakF1600Sha3(SB), $200-16
	MOVD	a+0(FP), R0
	MOVD	buf+8(FP), R3
	MOVD	$round_consts<>(SB), R1
	MOVD	$24, R2 // counter for loop

	// nil buf: skip the absorb step and only load the state.
	CBZ	R3, load_state

	// XOR path: load state and XOR with buf (17 lanes = 136 bytes).
	// Each step loads two state lanes and two buf lanes (16 bytes each),
	// using V25/V26 as scratch; both are rewritten by theta before being read.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V0.B16, V0.B16
	VEOR	V26.B16, V1.B16, V1.B16

	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V2.B16, V2.B16
	VEOR	V26.B16, V3.B16, V3.B16

	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V4.B16, V4.B16
	VEOR	V26.B16, V5.B16, V5.B16

	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V6.B16, V6.B16
	VEOR	V26.B16, V7.B16, V7.B16

	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V8.B16, V8.B16
	VEOR	V26.B16, V9.B16, V9.B16

	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V10.B16, V10.B16
	VEOR	V26.B16, V11.B16, V11.B16

	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V12.B16, V12.B16
	VEOR	V26.B16, V13.B16, V13.B16

	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R3), [V25.D1, V26.D1]
	VEOR	V25.B16, V14.B16, V14.B16
	VEOR	V26.B16, V15.B16, V15.B16

	// Lane 16: last data lane (8 bytes at buf offset 128).
	// Lane 17 is loaded alongside it but has no buf data to XOR.
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1	(R3), [V25.D1]
	VEOR	V25.B16, V16.B16, V16.B16

	// Remaining state lanes 18-24 (no data to XOR).
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Twelve 16-byte post-increments were applied to R0; rewind to the
	// start of the state for the final store.
	SUB	$192, R0, R0
	B	rounds

load_state:
	// Permute-only path: load all 25 lanes without touching buf.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Rewind R0 to the start of the state (12 x 16 bytes were consumed).
	SUB	$192, R0, R0

rounds:
	// theta
	// Column parities: V25..V29 = XOR of lanes {x, x+5, x+10, x+15, x+20}
	// for x = 0..4, built in two three-way XOR passes.
	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16

	// Per-column theta terms: each result is one parity XORed with another
	// parity rotated left by 1. After this group the terms applied below are
	// V29 (column 0), V30 (column 1), V31 (column 2), V27 (column 3) and
	// V28 (column 4). The first two go to fresh registers because V25/V26
	// are still needed as inputs by the last two instructions.
	VRAX1	V27.D2, V25.D2, V30.D2
	VRAX1	V28.D2, V26.D2, V31.D2
	VRAX1	V29.D2, V27.D2, V27.D2
	VRAX1	V25.D2, V28.D2, V28.D2
	VRAX1	V26.D2, V29.D2, V29.D2

	// theta and rho and Pi
	// Lane 0 is only XORed with its theta term (no rotation, no move).
	// Every VXAR XORs a lane with its column's theta term, rotates right by
	// the immediate and writes the result into the register of the lane's
	// new position. The order is significant: a destination register is
	// overwritten only after its previous value has been consumed, and the
	// first source of each chain is parked in a temporary (V25, V26, ...).
	VEOR	V29.B16, V0.B16, V0.B16

	VXAR	$63, V30.D2, V1.D2, V25.D2

	VXAR	$20, V30.D2, V6.D2, V1.D2
	VXAR	$44, V28.D2, V9.D2, V6.D2
	VXAR	$3, V31.D2, V22.D2, V9.D2
	VXAR	$25, V28.D2, V14.D2, V22.D2
	VXAR	$46, V29.D2, V20.D2, V14.D2

	VXAR	$2, V31.D2, V2.D2, V26.D2

	VXAR	$21, V31.D2, V12.D2, V2.D2
	VXAR	$39, V27.D2, V13.D2, V12.D2
	VXAR	$56, V28.D2, V19.D2, V13.D2
	VXAR	$8, V27.D2, V23.D2, V19.D2
	VXAR	$23, V29.D2, V15.D2, V23.D2

	VXAR	$37, V28.D2, V4.D2, V15.D2

	// From here on the theta-term registers themselves are reused as
	// destinations once their last use as a theta term has been issued.
	VXAR	$50, V28.D2, V24.D2, V28.D2
	VXAR	$62, V30.D2, V21.D2, V24.D2
	VXAR	$9, V27.D2, V8.D2, V8.D2
	VXAR	$19, V30.D2, V16.D2, V4.D2
	VXAR	$28, V29.D2, V5.D2, V16.D2

	VXAR	$36, V27.D2, V3.D2, V5.D2

	VXAR	$43, V27.D2, V18.D2, V27.D2
	VXAR	$49, V31.D2, V17.D2, V3.D2
	VXAR	$54, V30.D2, V11.D2, V30.D2
	VXAR	$58, V31.D2, V7.D2, V31.D2
	VXAR	$61, V29.D2, V10.D2, V29.D2

	// chi and iota
	// VBCAX Va, Vm, Vn, Vd computes Vd = Vn ^ (Vm AND NOT Va). Each group
	// of five handles one row and writes lanes 20-24, 15-19, 10-14, 5-9
	// and finally 0-4.
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free after the row above: load this round's constant into it
	// (replicated to both 64-bit halves) and advance R1 to the next one.
	VLD1R.P	8(R1), [V26.D2]

	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16

	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)

	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	VEOR	V26.B16, V0.B16, V0.B16 // iota

	// Next round, until all 24 are done.
	SUB	$1, R2, R2
	CBNZ	R2, rounds

	// Write the 25 lanes back over the input state (R0 was rewound above).
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)

	RET
|
|
|
|
// round_consts<> holds the 24 64-bit round constants, one per round and in
// round order. keccakF1600Sha3 reads them sequentially through R1, 8 bytes
// per round, so the table must stay exactly 24 * 8 = 192 bytes long and
// keep this ordering.
DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001 // round 0
DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082 // round 1
DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a // round 2
DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000 // round 3
DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b // round 4
DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001 // round 5
DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081 // round 6
DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009 // round 7
DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a // round 8
DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088 // round 9
DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009 // round 10
DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a // round 11
DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b // round 12
DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b // round 13
DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089 // round 14
DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003 // round 15
DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002 // round 16
DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080 // round 17
DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a // round 18
DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a // round 19
DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081 // round 20
DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080 // round 21
DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001 // round 22
DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008 // round 23
// File-local (<>), read-only, pointer-free data.
GLOBL	round_consts<>(SB), NOPTR|RODATA, $192
|