glibc mirror of git://sourceware.org/git/glibc.git
x86: Optimize L(less_vec) case in memcmpeq-evex.S
No bug. The optimizations are twofold:

1) Replace the page-cross and 0/1-size checks in L(less_vec) with masked load instructions. In applications this reduces branch misses in the hot [0, 32] case.

2) Change the control flow so that the L(less_vec) case gets the fall-through.

Change 2) helps copies in the [0, 32] size range but comes at the cost of copies in the [33, 64] size range. From profiles of GCC and Python3, 94%+ and 99%+ of calls respectively are in the [0, 32] range, so this appears to be the right tradeoff.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
commit cca457f9c5 (parent abddd61de0)
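As background (not part of the commit), the masked-load trick behind optimization 1) can be sketched in C with AVX512VL/AVX512BW and BMI2 intrinsics. The helper name memcmpeq_small_sketch and the standalone-function framing are illustrative only; the real code handles the first VEC_SIZE (32) bytes inside __memcmpeq_evex and masks the load of the second buffer plus the compare against the first, which is equivalent to the two masked loads below.

#include <immintrin.h>
#include <stddef.h>

/* Illustrative sketch only (compile with -mavx512vl -mavx512bw -mbmi2):
   compare n <= 32 bytes without a page-cross check or a size == 0/1
   special case.  */
static int
memcmpeq_small_sketch (const void *s1, const void *s2, size_t n)
{
  /* bzhi builds a mask with the low n bits set (all 32 bits for n == 32),
     mirroring the movl $-1 / bzhil / kmovd sequence in the diff.  */
  __mmask32 k = _bzhi_u32 (0xffffffffU, (unsigned int) n);
  /* Masked loads only touch the bytes selected by k, so they cannot
     fault even if a full 32-byte load would cross into an unmapped
     page past s1 + n or s2 + n.  */
  __m256i a = _mm256_maskz_loadu_epi8 (k, s1);
  __m256i b = _mm256_maskz_loadu_epi8 (k, s2);
  /* Nonzero iff any selected byte differs; __memcmpeq only has to
     return zero/nonzero, so no byte-order fixup is needed.  */
  return (int) _mm256_mask_cmpneq_epu8_mask (k, a, b);
}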
@@ -39,6 +39,7 @@
# define MEMCMPEQ __memcmpeq_evex
# endif

# define VMOVU_MASK vmovdqu8
# define VMOVU vmovdqu64
# define VPCMP vpcmpub
# define VPTEST vptestmb
@@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
        movl %edx, %edx
# endif
        cmp $VEC_SIZE, %RDX_LP
        jb L(less_vec)
        /* Fall through for [0, VEC_SIZE] as its the hottest. */
        ja L(more_1x_vec)

        /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
        /* Create mask of bytes that are guranteed to be valid because
           of length (edx). Using masked movs allows us to skip checks for
           page crosses/zero size. */
        movl $-1, %ecx
        bzhil %edx, %ecx, %ecx
        kmovd %ecx, %k2

        /* Use masked loads as VEC_SIZE could page cross where length
           (edx) would not. */
        VMOVU_MASK (%rsi), %YMM2{%k2}
        VPCMP $4,(%rdi), %YMM2, %k1{%k2}
        kmovd %k1, %eax
        ret

L(last_1x_vec):
        VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
        VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
        kmovd %k1, %eax
L(return_neq0):
        ret

        .p2align 4
L(more_1x_vec):
        /* From VEC + 1 to 2 * VEC. */
        VMOVU (%rsi), %YMM1
        /* Use compare not equals to directly check for mismatch. */
        VPCMP $4, (%rdi), %YMM1, %k1
        VPCMP $4,(%rdi), %YMM1, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_neq0)
@@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)

        /* Check third and fourth VEC no matter what. */
        VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
        VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
        VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_neq0)

        VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
        VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
        VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_neq0)
@@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
        /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
        VPTEST %YMM4, %YMM4, %k1
        kmovd %k1, %eax
L(return_neq0):
        ret

        /* Fits in padding needed to .p2align 5 L(less_vec). */
L(last_1x_vec):
        VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
        VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
        kmovd %k1, %eax
        ret

        /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
           byte aligned. */
        .p2align 5
L(less_vec):
        /* Check if one or less char. This is necessary for size = 0 but
           is also faster for size = 1. */
        cmpl $1, %edx
        jbe L(one_or_less)

        /* Check if loading one VEC from either s1 or s2 could cause a
           page cross. This can have false positives but is by far the
           fastest method. */
        movl %edi, %eax
        orl %esi, %eax
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jg L(page_cross_less_vec)

        /* No page cross possible. */
        VMOVU (%rsi), %YMM2
        VPCMP $4, (%rdi), %YMM2, %k1
        kmovd %k1, %eax
        /* Result will be zero if s1 and s2 match. Otherwise first set
           bit will be first mismatch. */
        bzhil %edx, %eax, %eax
        ret

        /* Relatively cold but placing close to L(less_vec) for 2 byte
           jump encoding. */
        .p2align 4
L(one_or_less):
        jb L(zero)
        movzbl (%rsi), %ecx
        movzbl (%rdi), %eax
        subl %ecx, %eax
        /* No ymm register was touched. */
        ret
        /* Within the same 16 byte block is L(one_or_less). */
L(zero):
        xorl %eax, %eax
        ret

        .p2align 4
L(last_2x_vec):
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
        vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
        VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
        vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
        VPTEST %YMM2, %YMM2, %k1
        kmovd %k1, %eax
        ret

        .p2align 4
@@ -211,7 +179,7 @@ L(loop_4x_vec):
        vpxorq (%rdi), %YMM1, %YMM1

        VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
        vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
        vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2

        VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
        vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
@@ -238,7 +206,7 @@ L(loop_4x_vec):
        VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
        /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
           oring with YMM4. Result is stored in YMM4. */
        vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
        vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
        cmpl $(VEC_SIZE * 2), %edi
        jae L(8x_last_2x_vec)
@@ -256,68 +224,16 @@ L(8x_last_2x_vec):
L(return_neq2):
        ret

        /* Relatively cold case as page cross are unexpected. */
        .p2align 4
L(page_cross_less_vec):
        cmpl $16, %edx
        jae L(between_16_31)
        cmpl $8, %edx
        ja L(between_9_15)
        cmpl $4, %edx
        jb L(between_2_3)
        /* From 4 to 8 bytes. No branch when size == 4. */
        movl (%rdi), %eax
        subl (%rsi), %eax
        movl -4(%rdi, %rdx), %ecx
        movl -4(%rsi, %rdx), %edi
        subl %edi, %ecx
        orl %ecx, %eax
        ret

        .p2align 4,, 8
L(between_16_31):
        /* From 16 to 31 bytes. No branch when size == 16. */

        /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
         */
        vmovdqu (%rsi), %xmm1
        vpcmpeqb (%rdi), %xmm1, %xmm1
        vmovdqu -16(%rsi, %rdx), %xmm2
        vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
        vpand %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        notw %ax
        /* No ymm register was touched. */
L(last_2x_vec):
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
        vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
        VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
        vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
        VPTEST %YMM2, %YMM2, %k1
        kmovd %k1, %eax
        ret

        .p2align 4,, 8
L(between_9_15):
        /* From 9 to 15 bytes. */
        movq (%rdi), %rax
        subq (%rsi), %rax
        movq -8(%rdi, %rdx), %rcx
        movq -8(%rsi, %rdx), %rdi
        subq %rdi, %rcx
        orq %rcx, %rax
        /* edx is guranteed to be a non-zero int. */
        cmovnz %edx, %eax
        ret

        /* Don't align. This is cold and aligning here will cause code
           to spill into next cache line. */
L(between_2_3):
        /* From 2 to 3 bytes. No branch when size == 2. */
        movzwl (%rdi), %eax
        movzwl (%rsi), %ecx
        subl %ecx, %eax
        movzbl -1(%rdi, %rdx), %ecx
        /* All machines that support evex will insert a "merging uop"
           avoiding any serious partial register stalls. */
        subb -1(%rsi, %rdx), %cl
        orl %ecx, %eax
        /* No ymm register was touched. */
        ret

        /* 4 Bytes from next cache line. */
        /* 1 Bytes from next cache line. */
END (MEMCMPEQ)
#endif
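Also not part of the commit: the vpternlogd immediates the file leans on ($0xde in the tail and loop merges, $0xf6 in the final loop iteration shown above) each fold an XOR of two compared vectors together with an OR into the running difference accumulator in a single instruction. A small, purely illustrative C check of the truth tables, under the convention that bit i of the immediate gives the result for input bits (dst, src1, mem) = (i>>2 & 1, i>>1 & 1, i & 1):

#include <stdio.h>

/* Bit i of a vpternlogd immediate is the result for the input bits
   (dst, src1, mem) = (i >> 2 & 1, i >> 1 & 1, i & 1).  */
static unsigned int
ternlog (unsigned int imm, unsigned int dst, unsigned int src1, unsigned int mem)
{
  return (imm >> ((dst << 2) | (src1 << 1) | mem)) & 1;
}

int
main (void)
{
  for (unsigned int a = 0; a < 2; a++)        /* dst  */
    for (unsigned int b = 0; b < 2; b++)      /* src1 */
      for (unsigned int c = 0; c < 2; c++)    /* mem  */
        {
          /* $0xde: dst = src1 | (dst ^ mem), i.e. xor the new pair of
             vectors and or in the previous difference vector.  */
          if (ternlog (0xde, a, b, c) != (b | (a ^ c)))
            puts ("0xde does not match src1 | (dst ^ mem)");
          /* $0xf6: dst = dst | (src1 ^ mem), per the comment in the
             diff above.  */
          if (ternlog (0xf6, a, b, c) != (a | (b ^ c)))
            puts ("0xf6 does not match dst | (src1 ^ mem)");
        }
  puts ("vpternlogd immediates verified");
  return 0;
}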