Mirror of git://sourceware.org/git/glibc.git (synced 2025-02-23 13:09:58 +08:00)

Fix misspellings in sysdeps/x86_64 -- BZ 25337.

Applying this commit results in bit-identical rebuild of libc.so.6 math/libm.so.6 elf/ld-linux-x86-64.so.2 mathvec/libmvec.so.1

Reviewed-by: Florian Weimer <fweimer@redhat.com>

parent ec9a66cd01
commit 1e9d5987fd
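A hedged sketch of how the bit-identical claim above could be checked: hash the four shared objects from two build trees, one built before this commit and one after. The "build-before" and "build-after" directory names are assumptions for illustration, not part of the commit.

#!/usr/bin/env python3
# Sketch: compare the shared objects named in the commit message from two
# glibc build trees.  "build-before" and "build-after" are assumed paths.
import hashlib
from pathlib import Path

ARTIFACTS = [
    "libc.so.6",
    "math/libm.so.6",
    "elf/ld-linux-x86-64.so.2",
    "mathvec/libmvec.so.1",
]

def sha256(path: Path) -> str:
    # Hash the whole file; a byte-for-byte identical rebuild gives equal digests.
    return hashlib.sha256(path.read_bytes()).hexdigest()

for name in ARTIFACTS:
    before = sha256(Path("build-before") / name)
    after = sha256(Path("build-after") / name)
    print(("identical: " if before == after else "DIFFERS:   ") + name)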
@@ -28,11 +28,11 @@
 # undef BASE

 # if (STATE_SAVE_ALIGNMENT % 16) != 0
-# error STATE_SAVE_ALIGNMENT must be multples of 16
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
 # endif

 # if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
-# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
+# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
 # endif

 # if DL_RUNTIME_RESOLVE_REALIGN_STACK
@@ -43,7 +43,7 @@
 /* Use fxsave to save XMM registers. */
 # define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET)
 # if (REGISTER_SAVE_AREA % 16) != 0
-# error REGISTER_SAVE_AREA must be multples of 16
+# error REGISTER_SAVE_AREA must be multiple of 16
 # endif
 # endif
 # else
@@ -57,7 +57,7 @@
 # define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
 # define BASE rsp
 # if (REGISTER_SAVE_AREA % 16) != 8
-# error REGISTER_SAVE_AREA must be odd multples of 8
+# error REGISTER_SAVE_AREA must be odd multiple of 8
 # endif
 # endif

@@ -161,7 +161,7 @@ _dl_runtime_resolve:

 #if !defined PROF && defined _dl_runtime_profile
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
-# error LR_VECTOR_OFFSET must be multples of VEC_SIZE
+# error LR_VECTOR_OFFSET must be multiple of VEC_SIZE
 # endif

 .globl _dl_runtime_profile
@@ -173,7 +173,7 @@ _dl_runtime_profile:
 cfi_adjust_cfa_offset(16) # Incorporate PLT
 _CET_ENDBR
 /* The La_x86_64_regs data structure pointed to by the
-fourth paramater must be VEC_SIZE-byte aligned. This must
+fourth parameter must be VEC_SIZE-byte aligned. This must
 be explicitly enforced. We have the set up a dynamically
 sized stack frame. %rbx points to the top half which
 has a fixed size and preserves the original stack pointer. */
@@ -31,7 +31,7 @@ __feupdateenv (const fenv_t *envp)
 /* Install new environment. */
 __fesetenv (envp);

-/* Raise the saved exception. Incidently for us the implementation
+/* Raise the saved exception. Incidentally for us the implementation
 defined format of the values in objects of type fexcept_t is the
 same as the ones specified using the FE_* constants. */
 __feraiseexcept ((int) temp);
@@ -1,4 +1,4 @@
-/* Common definition for strcasecmp famly ifunc selections.
+/* Common definition for strcasecmp family ifunc selections.
 All versions must be listed in ifunc-impl-list.c.
 Copyright (C) 2017-2023 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
@@ -440,13 +440,13 @@ L(loop_4x_vec):
 ymm0-15 is used at all is because there is no EVEX encoding
 vpcmpeq and with vpcmpeq this loop can be performed more
 efficiently. The non-vzeroupper version is safe for RTM
-while the vzeroupper version should be prefered if RTM are
+while the vzeroupper version should be preferred if RTM are
 not supported. Which loop version we use is determined by
 USE_TERN_IN_LOOP. */

 # if USE_TERN_IN_LOOP
 /* Since vptern can only take 3x vectors fastest to do 1 vec
-seperately with EVEX vpcmp. */
+separately with EVEX vpcmp. */
 # ifdef USE_AS_WMEMCHR
 /* vptern can only accept masks for epi32/epi64 so can only save
 instruction using not equals mask on vptern with wmemchr.
@@ -539,7 +539,7 @@ L(last_vec_x1_novzero):

 # if CHAR_PER_VEC == 64
 /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
-64 it needs a seperate return label. */
+64 it needs a separate return label. */
 .p2align 4,, 4
 L(last_vec_x2):
 L(last_vec_x2_novzero):
@@ -579,8 +579,8 @@ L(loop_vec_ret):
 (only if used VEX encoded loop). */
 COND_VZEROUPPER

-/* Seperate logic for CHAR_PER_VEC == 64 vs the rest. For
-CHAR_PER_VEC we test the last 2x VEC seperately, for
+/* Separate logic for CHAR_PER_VEC == 64 vs the rest. For
+CHAR_PER_VEC we test the last 2x VEC separately, for
 CHAR_PER_VEC <= 32 we can combine the results from the 2x
 VEC in a single GPR. */
 # if CHAR_PER_VEC == 64
@@ -29,7 +29,7 @@
 3. Use xmm vector compare when size >= 4 bytes for memcmp or
 size >= 8 bytes for wmemcmp.
 4. Optimistically compare up to first 4 * VEC_SIZE one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 5. If size is 8 * VEC_SIZE or less, unroll the loop.
 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -66,7 +66,7 @@

 /* Warning!
 wmemcmp has to use SIGNED comparison for elements.
-memcmp has to use UNSIGNED comparison for elemnts.
+memcmp has to use UNSIGNED comparison for elements.
 */

 .section SECTION(.text),"ax",@progbits
@@ -30,7 +30,7 @@
 3. Use xmm vector compare when size >= 4 bytes for memcmp or
 size >= 8 bytes for wmemcmp.
 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 5. If size is 8 * VEC_SIZE or less, unroll the loop.
 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -90,7 +90,7 @@ Latency:

 /* Warning!
 wmemcmp has to use SIGNED comparison for elements.
-memcmp has to use UNSIGNED comparison for elemnts.
+memcmp has to use UNSIGNED comparison for elements.
 */

 .section SECTION(.text), "ax", @progbits
@@ -105,7 +105,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 /* Fall through for [0, VEC_SIZE] as its the hottest. */
 ja L(more_1x_vec)

-/* Create mask of bytes that are guranteed to be valid because
+/* Create mask of bytes that are guaranteed to be valid because
 of length (edx). Using masked movs allows us to skip checks
 for page crosses/zero size. */
 mov $-1, %VRAX
@@ -365,7 +365,7 @@ L(loop_4x_vec):
 /* Load regardless of branch. */
 VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)

-/* Seperate logic as we can only use testb for VEC_SIZE == 64.
+/* Separate logic as we can only use testb for VEC_SIZE == 64.
 */
 # if VEC_SIZE == 64
 testb %dil, %dil
@@ -410,7 +410,7 @@ L(ret_nonzero_vec_start_4_5):
 .p2align 4,, 8
 L(ret_nonzero_vec_end_1):
 pmovmskb %xmm1, %ecx
-/* High 16 bits of eax guranteed to be all ones. Rotate them in
+/* High 16 bits of eax guaranteed to be all ones. Rotate them in
 to we can do `or + not` with just `xor`. */
 rorl $16, %eax
 xorl %ecx, %eax
@@ -562,7 +562,7 @@ L(ret_nonzero_loop):
 sall $(VEC_SIZE * 1), %edx
 leal 1(%rcx, %rdx), %edx
 pmovmskb %xmm2, %ecx
-/* High 16 bits of eax guranteed to be all ones. Rotate them in
+/* High 16 bits of eax guaranteed to be all ones. Rotate them in
 to we can do `or + not` with just `xor`. */
 rorl $16, %eax
 xorl %ecx, %eax
@@ -26,7 +26,7 @@
 and loading from either s1 or s2 would cause a page cross.
 2. Use xmm vector compare when size >= 8 bytes.
 3. Optimistically compare up to first 4 * VEC_SIZE one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 4. If size is 8 * VEC_SIZE or less, unroll the loop.
 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -302,7 +302,7 @@ L(between_9_15):
 movq -8(%rsi, %rdx), %rdi
 subq %rdi, %rcx
 orq %rcx, %rax
-/* edx is guranteed to be a non-zero int. */
+/* edx is guaranteed to be a non-zero int. */
 cmovnz %edx, %eax
 ret

@@ -26,7 +26,7 @@
 and loading from either s1 or s2 would cause a page cross.
 2. Use xmm vector compare when size >= 8 bytes.
 3. Optimistically compare up to first 4 * VEC_SIZE one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 4. If size is 8 * VEC_SIZE or less, unroll the loop.
 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -97,7 +97,7 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 /* Fall through for [0, VEC_SIZE] as its the hottest. */
 ja L(more_1x_vec)

-/* Create mask of bytes that are guranteed to be valid because
+/* Create mask of bytes that are guaranteed to be valid because
 of length (edx). Using masked movs allows us to skip checks
 for page crosses/zero size. */
 mov $-1, %VRAX
@@ -253,7 +253,7 @@ L(loop_4x_vec):
 oring with VEC(4). Result is stored in VEC(4). */
 vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)

-/* Seperate logic as we can only use testb for VEC_SIZE == 64.
+/* Separate logic as we can only use testb for VEC_SIZE == 64.
 */
 # if VEC_SIZE == 64
 testb %dil, %dil
@@ -231,7 +231,7 @@ L(end_loop_fwd):
 movups %xmm7, 48(%r8)
 ret

-/* Extactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
 60 bytes otherwise. */
 # define ALIGNED_LOOP_FWD(align_by); \
 .p2align 6; \
@@ -368,7 +368,7 @@ L(end_loop_bkwd):
 ret


-/* Extactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
 60 bytes otherwise. */
 # define ALIGNED_LOOP_BKWD(align_by); \
 .p2align 6; \
@@ -445,7 +445,7 @@ L(more_8x_vec_check):
 shrq $63, %r8
 /* Get 4k difference dst - src. */
 andl $(PAGE_SIZE - 256), %ecx
-/* If r8 is non-zero must do foward for correctness. Otherwise
+/* If r8 is non-zero must do forward for correctness. Otherwise
 if ecx is non-zero there is 4k False Alaising so do backward
 copy. */
 addl %r8d, %ecx
@@ -460,7 +460,7 @@ L(more_8x_vec_forward):
 /* First vec was already loaded into VEC(0). */
 VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(5)
 VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
-/* Save begining of dst. */
+/* Save beginning of dst. */
 movq %rdi, %rcx
 /* Align dst to VEC_SIZE - 1. */
 orq $(VEC_SIZE - 1), %rdi
@@ -517,7 +517,7 @@ L(more_8x_vec_backward):
 /* First vec was also loaded into VEC(0). */
 VMOVU VEC_SIZE(%rsi), %VMM(5)
 VMOVU (VEC_SIZE * 2)(%rsi), %VMM(6)
-/* Begining of region for 4x backward copy stored in rcx. */
+/* Beginning of region for 4x backward copy stored in rcx. */
 leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
 VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(8)
@@ -611,7 +611,7 @@ L(movsb):
 movq %rdi, %r8
 # endif
 /* If above __x86_rep_movsb_stop_threshold most likely is
-candidate for NT moves aswell. */
+candidate for NT moves as well. */
 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 jae L(large_memcpy_2x_check)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
@@ -65,7 +65,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)

 L(ret_vec_x0_test):
 /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
-will gurantee edx (len) is less than it. */
+will guarantee edx (len) is less than it. */
 lzcntl %ecx, %ecx

 /* Hoist vzeroupper (not great for RTM) to save code size. This allows
@@ -233,7 +233,7 @@ L(more_4x_vec):
 jnz L(ret_vec_x3)

 /* Check if near end before re-aligning (otherwise might do an
-unnecissary loop iteration). */
+unnecessary loop iteration). */
 addq $-(VEC_SIZE * 4), %rax
 cmpq $(VEC_SIZE * 4), %rdx
 jbe L(last_4x_vec)
@@ -119,7 +119,7 @@ L(last_2x_vec):
 # endif
 jle L(zero_2)

-/* We adjusted rax (length) for VEC_SIZE == 64 so need seperate
+/* We adjusted rax (length) for VEC_SIZE == 64 so need separate
 offsets. */
 # if VEC_SIZE == 64
 vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
@@ -354,7 +354,7 @@ L(loop_4x_vec):
 jnz L(first_vec_x1_end)
 KMOV %k2, %VRCX

-/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 individually, for VEC_SIZE == 32 we combine them in a single
 64-bit GPR. */
@@ -50,7 +50,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 jz L(page_cross)

 /* NB: This load happens regardless of whether rdx (len) is zero. Since
-it doesn't cross a page and the standard gurantees any pointer have
+it doesn't cross a page and the standard guarantees any pointer have
 at least one-valid byte this load must be safe. For the entire
 history of the x86 memrchr implementation this has been possible so
 no code "should" be relying on a zero-length check before this load.
@@ -199,7 +199,7 @@ L(less_vec_from_wmemset):
 MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
 andl $(PAGE_SIZE - 1), %edi
 /* Check if VEC_SIZE store cross page. Mask stores suffer
-serious performance degradation when it has to fault supress.
+serious performance degradation when it has to fault suppress.
 */
 cmpl $(PAGE_SIZE - VEC_SIZE), %edi
 /* This is generally considered a cold target. */
@@ -187,13 +187,13 @@ L(loop_4x_vec):
 ymm0-15 is used at all is because there is no EVEX encoding
 vpcmpeq and with vpcmpeq this loop can be performed more
 efficiently. The non-vzeroupper version is safe for RTM
-while the vzeroupper version should be prefered if RTM are
+while the vzeroupper version should be preferred if RTM are
 not supported. Which loop version we use is determined by
 USE_TERN_IN_LOOP. */

 # if USE_TERN_IN_LOOP
 /* Since vptern can only take 3x vectors fastest to do 1 vec
-seperately with EVEX vpcmp. */
+separately with EVEX vpcmp. */
 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
 /* Compare 3x with vpcmpeq and or them all together with vptern.
 */
@@ -256,7 +256,7 @@ L(loop_4x_vec):
 (only if used VEX encoded loop). */
 COND_VZEROUPPER

-/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 individually, for VEC_SIZE == 32 we combine them in a single
 64-bit GPR. */
@@ -163,7 +163,7 @@ ENTRY (STRCAT)
 decl %ecx
 jnz 21b

-/* Now the sources is aligned. Unfortunatly we cannot force
+/* Now the sources is aligned. Unfortunately we cannot force
 to have both source and destination aligned, so ignore the
 alignment of the destination. */
 .p2align 4
@@ -1,4 +1,4 @@
-/* strlen used for begining of str{n}cat using AVX2.
+/* strlen used for beginning of str{n}cat using AVX2.
 Copyright (C) 2011-2023 Free Software Foundation, Inc.
 This file is part of the GNU C Library.

@@ -1,4 +1,4 @@
-/* strlen used for begining of str{n}cat using EVEX 256/512.
+/* strlen used for beginning of str{n}cat using EVEX 256/512.
 Copyright (C) 2011-2023 Free Software Foundation, Inc.
 This file is part of the GNU C Library.

@@ -160,7 +160,7 @@ L(last_vec_x2):
 # endif
 L(first_vec_x1):
 /* Use bsf here to save 1-byte keeping keeping the block in 1x
-fetch block. eax guranteed non-zero. */
+fetch block. eax guaranteed non-zero. */
 bsf %VRCX, %VRCX
 # ifndef USE_AS_STRCHRNUL
 /* Found CHAR or the null byte. */
@@ -294,7 +294,7 @@ L(loop_4x_vec):

 /* Two methods for loop depending on VEC_SIZE. This is because
 with zmm registers VPMINU can only run on p0 (as opposed to
-p0/p1 for ymm) so it is less prefered. */
+p0/p1 for ymm) so it is less preferred. */
 # if VEC_SIZE == 32
 /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
 zero. */
@@ -340,7 +340,7 @@ L(loop_4x_vec):
 esi, the corresponding bit in %k3 is zero so the
 VPMINU_MASKZ will have a zero in the result). NB: This make
 the VPMINU 3c latency. The only way to avoid it is to
-createa a 12c dependency chain on all the `VPCMP $4, ...`
+create a 12c dependency chain on all the `VPCMP $4, ...`
 which has higher total latency. */
 VPMINU %VMM(2), %VMM(4), %VMM(4){%k3}{z}
 # endif
@@ -366,7 +366,7 @@ L(loop_4x_vec):
 # endif


-/* COND_MASK integates the esi matches for VEC_SIZE == 64. For
+/* COND_MASK integrates the esi matches for VEC_SIZE == 64. For
 VEC_SIZE == 32 they are already integrated. */
 VPTEST %VMM(2), %VMM(2), %k0 COND_MASK(k2)
 KMOV %k0, %VRCX
@@ -403,7 +403,7 @@ L(zero_end):
 # endif


-/* Seperate return label for last VEC1 because for VEC_SIZE ==
+/* Separate return label for last VEC1 because for VEC_SIZE ==
 32 we can reuse return code in L(page_cross) but VEC_SIZE ==
 64 has mismatched registers. */
 # if VEC_SIZE == 64
@@ -480,7 +480,7 @@ L(cross_page_boundary_real):
 */
 xorl $((1 << CHAR_PER_VEC)- 1), %eax
 # endif
-/* Use arithmatic shift so that leading 1s are filled in. */
+/* Use arithmetic shift so that leading 1s are filled in. */
 sarx %VGPR(SHIFT_REG), %VRAX, %VRAX
 /* If eax is all ones then no matches for esi or NULL. */

@@ -86,7 +86,7 @@ L(next_48_bytes):
 jne L(return)
 L(loop_start):
 /* We use this alignment to force loop be aligned to 8 but not
-16 bytes. This gives better sheduling on AMD processors. */
+16 bytes. This gives better scheduling on AMD processors. */
 .p2align 4
 pxor %xmm6, %xmm6
 andq $-64, %rdi
@@ -194,7 +194,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
 mov %fs:(%rax), %LOCALE_REG_LP

-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
@@ -501,7 +501,7 @@ L(more_3x_vec):
 L(prepare_loop):

 # ifdef USE_AS_STRNCMP
-/* Store N + (VEC_SIZE * 4) and place check at the begining of
+/* Store N + (VEC_SIZE * 4) and place check at the beginning of
 the loop. */
 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
 # endif
@@ -762,7 +762,7 @@ L(page_cross_during_loop):
 .p2align 4,, 4
 L(less_1x_vec_till_page_cross):
 subl $-(VEC_SIZE * 4), %eax
-/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
 concerning case is first iteration if incoming s1 was near start
 of a page and s2 near end. If s1 was near the start of the page
 we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
@@ -948,7 +948,7 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 /* If both are VEC aligned we don't need any special logic here.
-Only valid for strcmp where stop condition is guranteed to be
+Only valid for strcmp where stop condition is guaranteed to be
 reachable by just reading memory. */
 testl $((VEC_SIZE - 1) << 20), %eax
 jz L(no_page_cross)
@@ -984,7 +984,7 @@ L(page_cross):
 subl $(VEC_SIZE * 3), %eax
 jg L(less_1x_vec_till_page)

-/* If more than 1x VEC till page cross, loop throuh safely
+/* If more than 1x VEC till page cross, loop through safely
 loadable memory until within 1x VEC of page cross. */

 .p2align 4,, 10
@@ -1007,9 +1007,9 @@ L(page_cross_loop):
 jl L(page_cross_loop)

 subl %eax, %OFFSET_REG
-/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 to not cross page so is safe to load. Since we have already
-loaded at least 1 VEC from rsi it is also guranteed to be
+loaded at least 1 VEC from rsi it is also guaranteed to be
 safe. */

 VMOVU (%rdi, %OFFSET_REG64), %ymm0
@@ -217,7 +217,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
 mov %fs:(%rax), %LOCALE_REG_LP

-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
@@ -455,7 +455,7 @@ L(return_vec_3):
 # endif

 /* If CHAR_PER_VEC == 64 we can't combine matches from the last
-2x VEC so need seperate return label. */
+2x VEC so need separate return label. */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
 bsf %VRCX, %VRCX
@@ -567,7 +567,7 @@ L(prepare_loop_no_len):
 shrl $2, %ecx
 leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
 # else
-/* Store N + (VEC_SIZE * 4) and place check at the begining of
+/* Store N + (VEC_SIZE * 4) and place check at the beginning of
 the loop. */
 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
 L(prepare_loop_no_len):
@@ -840,7 +840,7 @@ L(ret7):


 /* If CHAR_PER_VEC == 64 we can't combine matches from the last
-2x VEC so need seperate return label. */
+2x VEC so need separate return label. */
 # if CHAR_PER_VEC == 64
 L(return_vec_2_end):
 bsf %VRCX, %VRCX
@@ -906,7 +906,7 @@ L(page_cross_during_loop):
 .p2align 4,, 4
 L(less_1x_vec_till_page_cross):
 subl $-(VEC_SIZE * 4), %eax
-/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
 concerning case is first iteration if incoming s1 was near start
 of a page and s2 near end. If s1 was near the start of the page
 we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
@@ -997,7 +997,7 @@ L(return_page_cross_end_check):
 and %VR10, %VRCX
 /* Need to use tzcnt here as VRCX may be zero. If VRCX is zero
 tzcnt(VRCX) will be CHAR_PER and remaining length (edx) is
-guranteed to be <= CHAR_PER_VEC so we will only use the return
+guaranteed to be <= CHAR_PER_VEC so we will only use the return
 idx if VRCX was non-zero. */
 tzcnt %VRCX, %VRCX
 leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
@@ -1147,7 +1147,7 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 /* If both are VEC aligned we don't need any special logic here.
-Only valid for strcmp where stop condition is guranteed to
+Only valid for strcmp where stop condition is guaranteed to
 be reachable by just reading memory. */
 testl $((VEC_SIZE - 1) << 20), %eax
 jz L(no_page_cross)
@@ -1185,7 +1185,7 @@ L(page_cross):
 jg L(less_1x_vec_till_page)


-/* If more than 1x VEC till page cross, loop throuh safely
+/* If more than 1x VEC till page cross, loop through safely
 loadable memory until within 1x VEC of page cross. */
 .p2align 4,, 8
 L(page_cross_loop):
@@ -1209,9 +1209,9 @@ L(page_cross_loop):


 subl %eax, %OFFSET_REG
-/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 to not cross page so is safe to load. Since we have already
-loaded at least 1 VEC from rsi it is also guranteed to be
+loaded at least 1 VEC from rsi it is also guaranteed to be
 safe. */
 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
 VPTESTM %VMM(0), %VMM(0), %k2
@@ -20,7 +20,7 @@

 /* Continue building as ISA level 2. We use this as ISA V2 default
 because strcmp-sse42 uses pcmpstri (slow on some SSE4.2
-processors) and this implementation is potenially faster than
+processors) and this implementation is potentially faster than
 strcmp-sse42 (aside from the slower page cross case). */
 #if ISA_SHOULD_BUILD (2)

@@ -75,7 +75,7 @@ ENTRY2 (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RDX_LP

-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END2 (STRCASECMP)
 /* FALLTHROUGH to strcasecmp_l. */
@@ -89,7 +89,7 @@ ENTRY2 (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RCX_LP

-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END2 (STRCASECMP)
 /* FALLTHROUGH to strncasecmp_l. */
@@ -186,7 +186,7 @@ ENTRY (STRCMP)
 jnz LABEL(less16bytes) /* If not, find different value or null char */
 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 sub $16, %r11
-jbe LABEL(strcmp_exitz) /* finish comparision */
+jbe LABEL(strcmp_exitz) /* finish comparison */
 # endif
 add $16, %rsi /* prepare to search next 16 bytes */
 add $16, %rdi /* prepare to search next 16 bytes */
@@ -400,7 +400,7 @@ LABEL(nibble_ashr_1):
 # endif

 pxor %xmm0, %xmm0
-sub $0x1000, %r10 /* substract 4K from %r10 */
+sub $0x1000, %r10 /* subtract 4K from %r10 */
 jmp LABEL(gobble_ashr_1)

 /*
@@ -84,7 +84,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RDX_LP

-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strcasecmp_l. */
@@ -94,7 +94,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RCX_LP

-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strncasecmp_l. */
@@ -50,7 +50,7 @@ ENTRY (STRCPY)
 5:
 movq $0xfefefefefefefeff,%r8

-/* Now the sources is aligned. Unfortunatly we cannot force
+/* Now the sources is aligned. Unfortunately we cannot force
 to have both source and destination aligned, so ignore the
 alignment of the destination. */
 .p2align 4
@@ -224,7 +224,7 @@ L(cross_page_continue):
 since data is only aligned to VEC_SIZE. */
 # ifdef USE_AS_STRNLEN
 /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
-because it simplies the logic in last_4x_vec_or_less. */
+because it simplifies the logic in last_4x_vec_or_less. */
 leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 subq %rdx, %rcx
 # ifdef USE_AS_WCSLEN
@@ -236,7 +236,7 @@ L(more_1x_vec):
 VMOVU %VMM(0), (%rdi)

 /* We are going to align rsi here so will need to be able to re-
-adjust rdi/rdx afterwords. NB: We filtered out huge lengths
+adjust rdi/rdx afterwards. NB: We filtered out huge lengths
 so rsi + rdx * CHAR_SIZE cannot overflow. */

 leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
@@ -99,7 +99,7 @@ L(page_cross_continue):
 /* `jb` because length rdx is now length - CHAR_SIZE. */
 jbe L(less_1x_vec)

-/* This may overset but thats fine because we still need to zero
+/* This may overset but that's fine because we still need to zero
 fill. */
 VMOVU %VMM(0), (%rdi)

@@ -130,7 +130,7 @@ L(page_cross_continue):
 jae L(more_1x_vec)

 /* If there where multiple zero-CHAR matches in the first VEC,
-VRCX will be overset but thats fine since any oversets where
+VRCX will be overset but that's fine since any oversets where
 at zero-positions anyways. */

 # ifdef USE_AS_STPCPY
@@ -177,7 +177,7 @@ L(more_1x_vec):
 # endif


-/* This may overset but thats fine because we still need to zero
+/* This may overset but that's fine because we still need to zero
 fill. */
 VMOVU %VMM(0), (%rdi)

@@ -189,7 +189,7 @@ L(more_1x_vec):


 /* We are going to align rsi here so will need to be able to re-
-adjust rdi/rdx afterwords. NB: We filtered out huge lengths
+adjust rdi/rdx afterwards. NB: We filtered out huge lengths
 so rsi + rdx * CHAR_SIZE cannot overflow. */
 leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
 subq %rsi, %rdi
@@ -221,7 +221,7 @@ L(last_2x_vec):
 cmpl $(CHAR_PER_VEC), %edx
 jb L(ret_vec_x1_len)

-/* Seperate logic for CHAR_PER_VEC == 64 because we already did
+/* Separate logic for CHAR_PER_VEC == 64 because we already did
 `tzcnt` on VRCX. */
 # if CHAR_PER_VEC == 64
 /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
@@ -296,7 +296,7 @@ L(ret_vec_x1_no_bsf):

 .p2align 4,, 8
 L(last_4x_vec):
-/* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl
+/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
 $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
 using `movzbl`. */
 # if CHAR_PER_VEC == 64
@@ -677,7 +677,7 @@ L(copy_16_31):
 vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
 cmpl %ecx, %edx

-/* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
 we have a larger copy block for 32-63 so this is just falls
 through to zfill 16-31. If VEC_SIZE == 32 then we check for
 full zfill of less 1x VEC. */
@@ -336,7 +336,7 @@ L(loop_last_4x_vec):

 VPTESTN %VMM(3), %VMM(3), %k0

-/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 individually, for VEC_SIZE == 32 we combine them in a single
 64-bit GPR. */
@@ -176,7 +176,7 @@ L(aligned_more):
 .p2align 4
 L(first_aligned_loop):
 /* Do 2x VEC at a time. Any more and the cost of finding the
-match outweights loop benefit. */
+match outweighs loop benefit. */
 vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
 vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

@@ -324,7 +324,7 @@ L(cross_page):
 vmovdqu (%rsi), %ymm1
 VPCMPEQ %ymm1, %ymm0, %ymm6
 vpmovmskb %ymm6, %ecx
-/* Shift out zero CHAR matches that are before the begining of
+/* Shift out zero CHAR matches that are before the beginning of
 src (rdi). */
 shrxl %edi, %ecx, %ecx
 testl %ecx, %ecx
@@ -332,7 +332,7 @@ L(cross_page):
 VPCMPEQ %ymm1, %ymm7, %ymm1
 vpmovmskb %ymm1, %eax

-/* Shift out search CHAR matches that are before the begining of
+/* Shift out search CHAR matches that are before the beginning of
 src (rdi). */
 shrxl %edi, %eax, %eax
 blsmskl %ecx, %ecx
@@ -152,7 +152,7 @@ L(loop):
 jnz L(loop_vec_x2_match)

 KMOV %k1, %VRDX
-/* Match is in first vector, rdi offset need to be substracted
+/* Match is in first vector, rdi offset need to be subtracted
 by VEC_SIZE. */
 sub $VEC_SIZE, %r8

@@ -216,7 +216,7 @@ L(check_last_match):
 ret

 /* No match recorded in r8. Check the second saved vector
-in begining. */
+in beginning. */
 L(vector_x2_ret):
 VPCMPEQ %VMM(2), %VMM(0), %k2
 KMOV %k2, %VRAX
@@ -139,7 +139,7 @@ L(first_vec_x1_or_x2):
 KORTEST %k2, %k3
 jz L(first_vec_x0_test)

-/* Guranteed that VEC(2) and VEC(3) are within range so merge
+/* Guaranteed that VEC(2) and VEC(3) are within range so merge
 the two bitmasks then get last result. */
 kunpck_2x %k2, %k3, %k3
 kmov_2x %k3, %maskm_2x
@@ -192,7 +192,7 @@ L(first_vec_x2):

 .p2align 4,, 12
 L(aligned_more):
-/* Need to keep original pointer incase VEC(1) has last match.
+/* Need to keep original pointer in case VEC(1) has last match.
 */
 movq %rdi, %rsi
 andq $-VEC_SIZE, %rdi
@@ -222,7 +222,7 @@ L(aligned_more):
 .p2align 4,, 10
 L(first_aligned_loop):
 /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
-gurantee they don't store a match. */
+guarantee they don't store a match. */
 VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
 VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)

@@ -285,7 +285,7 @@ L(second_aligned_loop_prep):
 L(second_aligned_loop_set_furthest_match):
 movq %rdi, %rsi
 /* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
-port0 and have noticable overhead in the loop. */
+port0 and have noticeable overhead in the loop. */
 VMOVA %VMM(5), %VMM(7)
 VMOVA %VMM(6), %VMM(8)
 .p2align 4
@@ -351,7 +351,7 @@ L(cross_page_boundary):
 /* eax contains all the page offset bits of src (rdi). `xor rdi,
 rax` sets pointer will all page offset bits cleared so
 offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
-before page cross (guranteed to be safe to read). Doing this
+before page cross (guaranteed to be safe to read). Doing this
 as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
 a bit of code size. */
 xorq %rdi, %rax
@@ -359,7 +359,7 @@ L(cross_page_boundary):
 VPTESTN %VMM(1), %VMM(1), %k0
 KMOV %k0, %VRCX

-/* Shift out zero CHAR matches that are before the begining of
+/* Shift out zero CHAR matches that are before the beginning of
 src (rdi). */
 # ifdef USE_AS_WCSRCHR
 movl %edi, %esi
@@ -374,7 +374,7 @@ L(cross_page_boundary):
 /* Found zero CHAR so need to test for search CHAR. */
 VPCMP $0, %VMATCH, %VMM(1), %k1
 KMOV %k1, %VRAX
-/* Shift out search CHAR matches that are before the begining of
+/* Shift out search CHAR matches that are before the beginning of
 src (rdi). */
 shrx %VGPR(SHIFT_REG), %VRAX, %VRAX

@@ -166,7 +166,7 @@ L(first_loop):
 /* Do 2x VEC at a time. */
 movaps (VEC_SIZE * 2)(%rdi), %xmm4
 movaps (VEC_SIZE * 3)(%rdi), %xmm5
-/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+/* Since SSE2 no pminud so wcsrchr needs separate logic for
 detecting zero. Note if this is found to be a bottleneck it
 may be worth adding an SSE4.1 wcsrchr implementation. */
 # ifdef USE_AS_WCSRCHR
@@ -238,7 +238,7 @@ L(new_match):

 /* We can't reuse either of the old comparisons as since we mask
 of zeros after first zero (instead of using the full
-comparison) we can't gurantee no interference between match
+comparison) we can't guarantee no interference between match
 after end of string and valid match. */
 pmovmskb %xmm4, %eax
 pmovmskb %xmm7, %edx
@@ -268,7 +268,7 @@ L(second_loop_match):
 L(second_loop):
 movaps (VEC_SIZE * 2)(%rdi), %xmm4
 movaps (VEC_SIZE * 3)(%rdi), %xmm5
-/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+/* Since SSE2 no pminud so wcsrchr needs separate logic for
 detecting zero. Note if this is found to be a bottleneck it
 may be worth adding an SSE4.1 wcsrchr implementation. */
 # ifdef USE_AS_WCSRCHR
@@ -297,11 +297,11 @@ L(second_loop):
 pmovmskb %xmm6, %eax

 addq $(VEC_SIZE * 2), %rdi
-/* Either null term or new occurence of CHAR. */
+/* Either null term or new occurrence of CHAR. */
 addl %ecx, %eax
 jz L(second_loop)

-/* No null term so much be new occurence of CHAR. */
+/* No null term so much be new occurrence of CHAR. */
 testl %ecx, %ecx
 jz L(second_loop_match)

@@ -331,7 +331,7 @@ L(second_loop_new_match):

 /* We can't reuse either of the old comparisons as since we mask
 of zeros after first zero (instead of using the full
-comparison) we can't gurantee no interference between match
+comparison) we can't guarantee no interference between match
 after end of string and valid match. */
 pmovmskb %xmm4, %eax
 pmovmskb %xmm7, %edx
@@ -140,7 +140,7 @@ __strstr_avx512 (const char *haystack, const char *ned)
 = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
 uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
 cmpmask = cmpmask & cvtmask64_u64 (loadmask);
-/* Search for the 2 charaters of needle */
+/* Search for the 2 characters of needle */
 __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
 __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
 k1 = kshiftri_mask64 (k1, 1);
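The hunks above are pure spelling corrections. As an illustrative sketch (not part of the commit), a checkout could be scanned for any remaining instance of the misspellings fixed here; the word list is taken from the hunks above, and the sysdeps/x86_64 path is assumed to be relative to a glibc source tree.

#!/usr/bin/env python3
# Sketch: report lines under sysdeps/x86_64 that still contain one of the
# misspellings corrected by this commit.  Word list taken from the diff above.
from pathlib import Path

MISSPELLINGS = (
    "multples", "gurantee", "seperate", "begining", "elemnts", "prefered",
    "famly", "paramater", "incidently", "extactly", "foward", "aswell",
    "unnecissary", "supress", "potenially", "dependeing", "comparision",
    "substract", "afterwords", "simplies", "sheduling", "outweights",
    "noticable", "arithmatic", "integates", "createa", "incase",
    "occurence", "charaters",
)

root = Path("sysdeps/x86_64")  # assumed glibc checkout layout
for path in sorted(p for p in root.rglob("*") if p.is_file()):
    text = path.read_text(errors="ignore")
    for lineno, line in enumerate(text.splitlines(), 1):
        lowered = line.lower()
        hits = [w for w in MISSPELLINGS if w in lowered]
        if hits:
            print(f"{path}:{lineno}: {', '.join(hits)}")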