common/bitutil: optimize ORBytes

This commit is contained in:
cuiweixie 2025-11-30 18:56:59 +08:00
parent a9eaf2ffd8
commit da18f7dddf
No known key found for this signature in database
GPG key ID: 16DF64EE15E495A3
5 changed files with 188 additions and 26 deletions

View file

@@ -77,32 +77,7 @@ func safeANDBytes(dst, a, b []byte) int {
// ORBytes ors the bytes in a and b. The destination is assumed to have enough
// space. Returns the number of bytes or'd.
//
// NOTE(review): this hunk is a rendered diff — the body below is the
// pre-change implementation; the commit replaces it with a single call to
// the per-architecture orBytes helper (see the added line further down).
func ORBytes(dst, a, b []byte) int {
	// Word-at-a-time ORing is only safe where unaligned loads/stores are.
	if supportsUnaligned {
		return fastORBytes(dst, a, b)
	}
	return safeORBytes(dst, a, b)
}
// fastORBytes ors in bulk. It only works on architectures that support
// unaligned read/writes.
func fastORBytes(dst, a, b []byte) int {
	// n is the number of bytes both inputs can supply.
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	// w full machine words fit in the first n bytes.
	w := n / wordSize
	if w > 0 {
		// Reinterpret the byte slices as []uintptr via their slice
		// headers; only the first w entries are touched, so the
		// byte-counted len is harmless here.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := 0; i < w; i++ {
			dw[i] = aw[i] | bw[i]
		}
	}
	// OR the remaining n % wordSize tail bytes one at a time.
	for i := n - n%wordSize; i < n; i++ {
		dst[i] = a[i] | b[i]
	}
	return n
	// NOTE(review): the line below is unreachable — this listing is diff
	// residue where the deleted old body and the added one-line
	// replacement (return orBytes) were rendered together.
	return orBytes(dst, a, b)
}
// safeORBytes ors one by one. It works on all architectures, independent if

59
common/bitutil/or_amd64.s Normal file
View file

@@ -0,0 +1,59 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_amd64.s
//go:build !purego
#include "textflag.h"
// func orBytesASM(dst, a, b *byte, n int)
//
// orBytesASM stores dst[i] = a[i] | b[i] for i in [0, n).
// Strategy (mirroring crypto xor_amd64.s): trim the tail from the back —
// single bytes until the remaining length is a multiple of 8, then one
// 8-byte word if needed — and once the length is a multiple of 16, OR the
// aligned prefix forwards 16 bytes at a time with SSE2.
TEXT ·orBytesASM(SB), NOSPLIT, $0
	MOVQ dst+0(FP), BX // BX = dst
	MOVQ a+8(FP), SI   // SI = a
	MOVQ b+16(FP), CX  // CX = b
	MOVQ n+24(FP), DX  // DX = remaining length
	TESTQ $15, DX      // AND 15 & len, if not zero jump to not_aligned.
	JNZ not_aligned

// Here len is a multiple of 16: process it forwards in 16-byte chunks.
aligned:
	MOVQ $0, AX // position in slices
	PCALIGN $16
loop16b:
	MOVOU (SI)(AX*1), X0 // OR 16byte forwards.
	MOVOU (CX)(AX*1), X1
	POR X1, X0
	MOVOU X0, (BX)(AX*1)
	ADDQ $16, AX
	CMPQ DX, AX
	JNE loop16b
	RET

	PCALIGN $16
// Byte-at-a-time tail loop, walking backwards from the end; AX is reused
// as a scratch byte here (the forward loop above re-zeroes it on entry).
loop_1b:
	SUBQ $1, DX // OR 1byte backwards.
	MOVB (SI)(DX*1), DI
	MOVB (CX)(DX*1), AX
	ORB AX, DI
	MOVB DI, (BX)(DX*1)
	TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
	JNZ loop_1b
	CMPQ DX, $0 // if len is 0, ret.
	JE ret
	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
	JZ aligned

// len is not a multiple of 16: peel 1-byte and/or 8-byte pieces off the
// back until it is (or until nothing is left).
not_aligned:
	TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
	JNE loop_1b
	SUBQ $8, DX // OR 8bytes backwards.
	MOVQ (SI)(DX*1), DI
	MOVQ (CX)(DX*1), AX
	ORQ AX, DI
	MOVQ DI, (BX)(DX*1)
	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
	JGE aligned

ret:
	RET

70
common/bitutil/or_arm64.s Normal file
View file

@@ -0,0 +1,70 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_arm64.s
//go:build !purego
#include "textflag.h"
// func orBytesASM(dst, a, b *byte, n int)
//
// orBytesASM stores dst[i] = a[i] | b[i] for i in [0, n).
// Main loop ORs 64 bytes per iteration with NEON; afterwards the remainder
// is < 64, so each bit of it (32/16/8/4/2/1) is tested with TBZ and the
// matching chunk is ORed with post-incremented loads/stores.
TEXT ·orBytesASM(SB), NOSPLIT|NOFRAME, $0
	MOVD dst+0(FP), R0 // R0 = dst
	MOVD a+8(FP), R1   // R1 = a
	MOVD b+16(FP), R2  // R2 = b
	MOVD n+24(FP), R3  // R3 = remaining length
	CMP $64, R3
	BLT tail
loop_64:
	// 64 bytes per iteration using four 16-byte vector registers.
	VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1.P 64(R2), [V4.B16, V5.B16, V6.B16, V7.B16]
	VORR V0.B16, V4.B16, V4.B16
	VORR V1.B16, V5.B16, V5.B16
	VORR V2.B16, V6.B16, V6.B16
	VORR V3.B16, V7.B16, V7.B16
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
	SUBS $64, R3
	CMP $64, R3
	BGE loop_64
tail:
	// quick end
	CBZ R3, end
	// Bit 5 set => a 32-byte chunk remains.
	TBZ $5, R3, less_than32
	VLD1.P 32(R1), [V0.B16, V1.B16]
	VLD1.P 32(R2), [V2.B16, V3.B16]
	VORR V0.B16, V2.B16, V2.B16
	VORR V1.B16, V3.B16, V3.B16
	VST1.P [V2.B16, V3.B16], 32(R0)
less_than32:
	// Bit 4 => 16 bytes, done as two 8-byte registers via LDP/STP.
	TBZ $4, R3, less_than16
	LDP.P 16(R1), (R11, R12)
	LDP.P 16(R2), (R13, R14)
	ORR R11, R13, R13
	ORR R12, R14, R14
	STP.P (R13, R14), 16(R0)
less_than16:
	// Bit 3 => 8 bytes.
	TBZ $3, R3, less_than8
	MOVD.P 8(R1), R11
	MOVD.P 8(R2), R12
	ORR R11, R12, R12
	MOVD.P R12, 8(R0)
less_than8:
	// Bit 2 => 4 bytes (32-bit OR).
	TBZ $2, R3, less_than4
	MOVWU.P 4(R1), R13
	MOVWU.P 4(R2), R14
	ORRW R13, R14, R14
	MOVWU.P R14, 4(R0)
less_than4:
	// Bit 1 => 2 bytes.
	TBZ $1, R3, less_than2
	MOVHU.P 2(R1), R15
	MOVHU.P 2(R2), R16
	ORRW R15, R16, R16
	MOVHU.P R16, 2(R0)
less_than2:
	// Bit 0 => final byte. NOTE(review): R18 is skipped (R17 then R19) —
	// presumably because R18 is the platform register; confirm against
	// the Go arm64 register conventions.
	TBZ $0, R3, end
	MOVBU (R1), R17
	MOVBU (R2), R19
	ORRW R17, R19, R19
	MOVBU R19, (R0)
end:
	RET

20
common/bitutil/or_asm.go Normal file
View file

@ -0,0 +1,20 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_asm.go
//go:build (amd64 || arm64) && !purego
package bitutil
// orBytes ORs together the overlapping prefix of a and b into dst and
// reports how many bytes were combined. dst is assumed to be large enough
// to hold the result.
func orBytes(dst, a, b []byte) int {
	count := len(a)
	if len(b) < count {
		count = len(b)
	}
	// Skip the assembly call entirely for empty input: there would be no
	// element 0 to take the address of.
	if count > 0 {
		orBytesASM(&dst[0], &a[0], &b[0], count)
	}
	return count
}
// orBytesASM ORs n bytes of a and b into dst; implemented in assembly
// (or_amd64.s / or_arm64.s). The noescape directive keeps the three slice
// element pointers from being forced onto the heap.
//
//go:noescape
func orBytesASM(dst, a, b *byte, n int)

View file

@@ -0,0 +1,38 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !arm64) || purego
package bitutil
import "unsafe"
// orBytes is the portable fallback used where no assembly implementation
// exists: it selects the fastest generic routine the platform permits.
func orBytes(dst, a, b []byte) int {
	if !supportsUnaligned {
		// Word-sized accesses could fault or be slow here; go byte by byte.
		return safeORBytes(dst, a, b)
	}
	return fastORBytes(dst, a, b)
}
// fastORBytes ors in bulk. It only works on architectures that support
// unaligned read/writes.
func fastORBytes(dst, a, b []byte) int {
	// n is the number of bytes both inputs can supply.
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	// w full machine words fit in the first n bytes.
	w := n / wordSize
	if w > 0 {
		// Reinterpret the byte slices as []uintptr through their slice
		// headers. The len/cap fields still count bytes, so the word
		// slices report an oversized length — harmless, since only the
		// first w entries are read or written. NOTE(review): this cast
		// predates unsafe.Slice and relies on slice-header layout.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := 0; i < w; i++ {
			dw[i] = aw[i] | bw[i]
		}
	}
	// OR the remaining n % wordSize tail bytes one at a time.
	for i := n - n%wordSize; i < n; i++ {
		dst[i] = a[i] | b[i]
	}
	return n
}