Fix misspellings in sysdeps/x86_64 -- BZ 25337.

Applying this commit results in bit-identical rebuild of libc.so.6
math/libm.so.6 elf/ld-linux-x86-64.so.2 mathvec/libmvec.so.1

Reviewed-by: Florian Weimer <fweimer@redhat.com>
commit 1e9d5987fd
parent ec9a66cd01
Author: Paul Pluzhnikov
Date: 2023-05-23 03:57:01 +00:00

37 changed files with 105 additions and 105 deletions

View File

@@ -28,11 +28,11 @@
 # undef BASE
 # if (STATE_SAVE_ALIGNMENT % 16) != 0
-# error STATE_SAVE_ALIGNMENT must be multples of 16
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
 # endif
 # if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
-# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
+# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
 # endif
 # if DL_RUNTIME_RESOLVE_REALIGN_STACK
@@ -43,7 +43,7 @@
 /* Use fxsave to save XMM registers. */
 # define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET)
 # if (REGISTER_SAVE_AREA % 16) != 0
-# error REGISTER_SAVE_AREA must be multples of 16
+# error REGISTER_SAVE_AREA must be multiple of 16
 # endif
 # endif
 # else
@@ -57,7 +57,7 @@
 # define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
 # define BASE rsp
 # if (REGISTER_SAVE_AREA % 16) != 8
-# error REGISTER_SAVE_AREA must be odd multples of 8
+# error REGISTER_SAVE_AREA must be odd multiple of 8
 # endif
 # endif
@@ -161,7 +161,7 @@ _dl_runtime_resolve:
 #if !defined PROF && defined _dl_runtime_profile
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
-# error LR_VECTOR_OFFSET must be multples of VEC_SIZE
+# error LR_VECTOR_OFFSET must be multiple of VEC_SIZE
 # endif
 .globl _dl_runtime_profile
@@ -173,7 +173,7 @@ _dl_runtime_profile:
 cfi_adjust_cfa_offset(16) # Incorporate PLT
 _CET_ENDBR
 /* The La_x86_64_regs data structure pointed to by the
-fourth paramater must be VEC_SIZE-byte aligned. This must
+fourth parameter must be VEC_SIZE-byte aligned. This must
 be explicitly enforced. We have the set up a dynamically
 sized stack frame. %rbx points to the top half which
 has a fixed size and preserves the original stack pointer. */

View File

@@ -31,7 +31,7 @@ __feupdateenv (const fenv_t *envp)
 /* Install new environment. */
 __fesetenv (envp);
-/* Raise the saved exception. Incidently for us the implementation
+/* Raise the saved exception. Incidentally for us the implementation
 defined format of the values in objects of type fexcept_t is the
 same as the ones specified using the FE_* constants. */
 __feraiseexcept ((int) temp);

View File

@@ -1,4 +1,4 @@
-/* Common definition for strcasecmp famly ifunc selections.
+/* Common definition for strcasecmp family ifunc selections.
 All versions must be listed in ifunc-impl-list.c.
 Copyright (C) 2017-2023 Free Software Foundation, Inc.
 This file is part of the GNU C Library.

View File

@@ -440,13 +440,13 @@ L(loop_4x_vec):
 ymm0-15 is used at all is because there is no EVEX encoding
 vpcmpeq and with vpcmpeq this loop can be performed more
 efficiently. The non-vzeroupper version is safe for RTM
-while the vzeroupper version should be prefered if RTM are
+while the vzeroupper version should be preferred if RTM are
 not supported. Which loop version we use is determined by
 USE_TERN_IN_LOOP. */
 # if USE_TERN_IN_LOOP
 /* Since vptern can only take 3x vectors fastest to do 1 vec
-seperately with EVEX vpcmp. */
+separately with EVEX vpcmp. */
 # ifdef USE_AS_WMEMCHR
 /* vptern can only accept masks for epi32/epi64 so can only save
 instruction using not equals mask on vptern with wmemchr.
@@ -539,7 +539,7 @@ L(last_vec_x1_novzero):
 # if CHAR_PER_VEC == 64
 /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
-64 it needs a seperate return label. */
+64 it needs a separate return label. */
 .p2align 4,, 4
 L(last_vec_x2):
 L(last_vec_x2_novzero):
@@ -579,8 +579,8 @@ L(loop_vec_ret):
 (only if used VEX encoded loop). */
 COND_VZEROUPPER
-/* Seperate logic for CHAR_PER_VEC == 64 vs the rest. For
-CHAR_PER_VEC we test the last 2x VEC seperately, for
+/* Separate logic for CHAR_PER_VEC == 64 vs the rest. For
+CHAR_PER_VEC we test the last 2x VEC separately, for
 CHAR_PER_VEC <= 32 we can combine the results from the 2x
 VEC in a single GPR. */
 # if CHAR_PER_VEC == 64

View File

@@ -29,7 +29,7 @@
 3. Use xmm vector compare when size >= 4 bytes for memcmp or
 size >= 8 bytes for wmemcmp.
 4. Optimistically compare up to first 4 * VEC_SIZE one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 5. If size is 8 * VEC_SIZE or less, unroll the loop.
 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -66,7 +66,7 @@
 /* Warning!
 wmemcmp has to use SIGNED comparison for elements.
-memcmp has to use UNSIGNED comparison for elemnts.
+memcmp has to use UNSIGNED comparison for elements.
 */
 .section SECTION(.text),"ax",@progbits

View File

@@ -30,7 +30,7 @@
 3. Use xmm vector compare when size >= 4 bytes for memcmp or
 size >= 8 bytes for wmemcmp.
 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 5. If size is 8 * VEC_SIZE or less, unroll the loop.
 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -90,7 +90,7 @@ Latency:
 /* Warning!
 wmemcmp has to use SIGNED comparison for elements.
-memcmp has to use UNSIGNED comparison for elemnts.
+memcmp has to use UNSIGNED comparison for elements.
 */
 .section SECTION(.text), "ax", @progbits
@@ -105,7 +105,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 /* Fall through for [0, VEC_SIZE] as its the hottest. */
 ja L(more_1x_vec)
-/* Create mask of bytes that are guranteed to be valid because
+/* Create mask of bytes that are guaranteed to be valid because
 of length (edx). Using masked movs allows us to skip checks
 for page crosses/zero size. */
 mov $-1, %VRAX
@@ -365,7 +365,7 @@ L(loop_4x_vec):
 /* Load regardless of branch. */
 VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
-/* Seperate logic as we can only use testb for VEC_SIZE == 64.
+/* Separate logic as we can only use testb for VEC_SIZE == 64.
 */
 # if VEC_SIZE == 64
 testb %dil, %dil

View File

@@ -410,7 +410,7 @@ L(ret_nonzero_vec_start_4_5):
 .p2align 4,, 8
 L(ret_nonzero_vec_end_1):
 pmovmskb %xmm1, %ecx
-/* High 16 bits of eax guranteed to be all ones. Rotate them in
+/* High 16 bits of eax guaranteed to be all ones. Rotate them in
 to we can do `or + not` with just `xor`. */
 rorl $16, %eax
 xorl %ecx, %eax
@@ -562,7 +562,7 @@ L(ret_nonzero_loop):
 sall $(VEC_SIZE * 1), %edx
 leal 1(%rcx, %rdx), %edx
 pmovmskb %xmm2, %ecx
-/* High 16 bits of eax guranteed to be all ones. Rotate them in
+/* High 16 bits of eax guaranteed to be all ones. Rotate them in
 to we can do `or + not` with just `xor`. */
 rorl $16, %eax
 xorl %ecx, %eax

View File

@@ -26,7 +26,7 @@
 and loading from either s1 or s2 would cause a page cross.
 2. Use xmm vector compare when size >= 8 bytes.
 3. Optimistically compare up to first 4 * VEC_SIZE one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 4. If size is 8 * VEC_SIZE or less, unroll the loop.
 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -302,7 +302,7 @@ L(between_9_15):
 movq -8(%rsi, %rdx), %rdi
 subq %rdi, %rcx
 orq %rcx, %rax
-/* edx is guranteed to be a non-zero int. */
+/* edx is guaranteed to be a non-zero int. */
 cmovnz %edx, %eax
 ret

View File

@@ -26,7 +26,7 @@
 and loading from either s1 or s2 would cause a page cross.
 2. Use xmm vector compare when size >= 8 bytes.
 3. Optimistically compare up to first 4 * VEC_SIZE one at a
-to check for early mismatches. Only do this if its guranteed the
+to check for early mismatches. Only do this if its guaranteed the
 work is not wasted.
 4. If size is 8 * VEC_SIZE or less, unroll the loop.
 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
@@ -97,7 +97,7 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 /* Fall through for [0, VEC_SIZE] as its the hottest. */
 ja L(more_1x_vec)
-/* Create mask of bytes that are guranteed to be valid because
+/* Create mask of bytes that are guaranteed to be valid because
 of length (edx). Using masked movs allows us to skip checks
 for page crosses/zero size. */
 mov $-1, %VRAX
@@ -253,7 +253,7 @@ L(loop_4x_vec):
 oring with VEC(4). Result is stored in VEC(4). */
 vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
-/* Seperate logic as we can only use testb for VEC_SIZE == 64.
+/* Separate logic as we can only use testb for VEC_SIZE == 64.
 */
 # if VEC_SIZE == 64
 testb %dil, %dil

View File

@@ -231,7 +231,7 @@ L(end_loop_fwd):
 movups %xmm7, 48(%r8)
 ret
-/* Extactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
 60 bytes otherwise. */
 # define ALIGNED_LOOP_FWD(align_by); \
 .p2align 6; \
@@ -368,7 +368,7 @@ L(end_loop_bkwd):
 ret
-/* Extactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
 60 bytes otherwise. */
 # define ALIGNED_LOOP_BKWD(align_by); \
 .p2align 6; \

View File

@@ -445,7 +445,7 @@ L(more_8x_vec_check):
 shrq $63, %r8
 /* Get 4k difference dst - src. */
 andl $(PAGE_SIZE - 256), %ecx
-/* If r8 is non-zero must do foward for correctness. Otherwise
+/* If r8 is non-zero must do forward for correctness. Otherwise
 if ecx is non-zero there is 4k False Alaising so do backward
 copy. */
 addl %r8d, %ecx
@@ -460,7 +460,7 @@ L(more_8x_vec_forward):
 /* First vec was already loaded into VEC(0). */
 VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(5)
 VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
-/* Save begining of dst. */
+/* Save beginning of dst. */
 movq %rdi, %rcx
 /* Align dst to VEC_SIZE - 1. */
 orq $(VEC_SIZE - 1), %rdi
@@ -517,7 +517,7 @@ L(more_8x_vec_backward):
 /* First vec was also loaded into VEC(0). */
 VMOVU VEC_SIZE(%rsi), %VMM(5)
 VMOVU (VEC_SIZE * 2)(%rsi), %VMM(6)
-/* Begining of region for 4x backward copy stored in rcx. */
+/* Beginning of region for 4x backward copy stored in rcx. */
 leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
 VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(8)
@@ -611,7 +611,7 @@ L(movsb):
 movq %rdi, %r8
 # endif
 /* If above __x86_rep_movsb_stop_threshold most likely is
-candidate for NT moves aswell. */
+candidate for NT moves as well. */
 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 jae L(large_memcpy_2x_check)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB

View File

@@ -65,7 +65,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 L(ret_vec_x0_test):
 /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
-will gurantee edx (len) is less than it. */
+will guarantee edx (len) is less than it. */
 lzcntl %ecx, %ecx
 /* Hoist vzeroupper (not great for RTM) to save code size. This allows
@@ -233,7 +233,7 @@ L(more_4x_vec):
 jnz L(ret_vec_x3)
 /* Check if near end before re-aligning (otherwise might do an
-unnecissary loop iteration). */
+unnecessary loop iteration). */
 addq $-(VEC_SIZE * 4), %rax
 cmpq $(VEC_SIZE * 4), %rdx
 jbe L(last_4x_vec)

View File

@@ -119,7 +119,7 @@ L(last_2x_vec):
 # endif
 jle L(zero_2)
-/* We adjusted rax (length) for VEC_SIZE == 64 so need seperate
+/* We adjusted rax (length) for VEC_SIZE == 64 so need separate
 offsets. */
 # if VEC_SIZE == 64
 vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
@@ -354,7 +354,7 @@ L(loop_4x_vec):
 jnz L(first_vec_x1_end)
 KMOV %k2, %VRCX
-/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 individually, for VEC_SIZE == 32 we combine them in a single
 64-bit GPR. */

View File

@@ -50,7 +50,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 jz L(page_cross)
 /* NB: This load happens regardless of whether rdx (len) is zero. Since
-it doesn't cross a page and the standard gurantees any pointer have
+it doesn't cross a page and the standard guarantees any pointer have
 at least one-valid byte this load must be safe. For the entire
 history of the x86 memrchr implementation this has been possible so
 no code "should" be relying on a zero-length check before this load.

View File

@@ -199,7 +199,7 @@ L(less_vec_from_wmemset):
 MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
 andl $(PAGE_SIZE - 1), %edi
 /* Check if VEC_SIZE store cross page. Mask stores suffer
-serious performance degradation when it has to fault supress.
+serious performance degradation when it has to fault suppress.
 */
 cmpl $(PAGE_SIZE - VEC_SIZE), %edi
 /* This is generally considered a cold target. */

View File

@@ -187,13 +187,13 @@ L(loop_4x_vec):
 ymm0-15 is used at all is because there is no EVEX encoding
 vpcmpeq and with vpcmpeq this loop can be performed more
 efficiently. The non-vzeroupper version is safe for RTM
-while the vzeroupper version should be prefered if RTM are
+while the vzeroupper version should be preferred if RTM are
 not supported. Which loop version we use is determined by
 USE_TERN_IN_LOOP. */
 # if USE_TERN_IN_LOOP
 /* Since vptern can only take 3x vectors fastest to do 1 vec
-seperately with EVEX vpcmp. */
+separately with EVEX vpcmp. */
 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
 /* Compare 3x with vpcmpeq and or them all together with vptern.
 */
@@ -256,7 +256,7 @@ L(loop_4x_vec):
 (only if used VEX encoded loop). */
 COND_VZEROUPPER
-/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 individually, for VEC_SIZE == 32 we combine them in a single
 64-bit GPR. */

View File

@@ -163,7 +163,7 @@ ENTRY (STRCAT)
 decl %ecx
 jnz 21b
-/* Now the sources is aligned. Unfortunatly we cannot force
+/* Now the sources is aligned. Unfortunately we cannot force
 to have both source and destination aligned, so ignore the
 alignment of the destination. */
 .p2align 4

View File

@@ -1,4 +1,4 @@
-/* strlen used for begining of str{n}cat using AVX2.
+/* strlen used for beginning of str{n}cat using AVX2.
 Copyright (C) 2011-2023 Free Software Foundation, Inc.
 This file is part of the GNU C Library.

View File

@@ -1,4 +1,4 @@
-/* strlen used for begining of str{n}cat using EVEX 256/512.
+/* strlen used for beginning of str{n}cat using EVEX 256/512.
 Copyright (C) 2011-2023 Free Software Foundation, Inc.
 This file is part of the GNU C Library.

View File

@@ -160,7 +160,7 @@ L(last_vec_x2):
 # endif
 L(first_vec_x1):
 /* Use bsf here to save 1-byte keeping keeping the block in 1x
-fetch block. eax guranteed non-zero. */
+fetch block. eax guaranteed non-zero. */
 bsf %VRCX, %VRCX
 # ifndef USE_AS_STRCHRNUL
 /* Found CHAR or the null byte. */
@@ -294,7 +294,7 @@ L(loop_4x_vec):
 /* Two methods for loop depending on VEC_SIZE. This is because
 with zmm registers VPMINU can only run on p0 (as opposed to
-p0/p1 for ymm) so it is less prefered. */
+p0/p1 for ymm) so it is less preferred. */
 # if VEC_SIZE == 32
 /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
 zero. */
@@ -340,7 +340,7 @@ L(loop_4x_vec):
 esi, the corresponding bit in %k3 is zero so the
 VPMINU_MASKZ will have a zero in the result). NB: This make
 the VPMINU 3c latency. The only way to avoid it is to
-createa a 12c dependency chain on all the `VPCMP $4, ...`
+create a 12c dependency chain on all the `VPCMP $4, ...`
 which has higher total latency. */
 VPMINU %VMM(2), %VMM(4), %VMM(4){%k3}{z}
 # endif
@@ -366,7 +366,7 @@ L(loop_4x_vec):
 # endif
-/* COND_MASK integates the esi matches for VEC_SIZE == 64. For
+/* COND_MASK integrates the esi matches for VEC_SIZE == 64. For
 VEC_SIZE == 32 they are already integrated. */
 VPTEST %VMM(2), %VMM(2), %k0 COND_MASK(k2)
 KMOV %k0, %VRCX
@@ -403,7 +403,7 @@ L(zero_end):
 # endif
-/* Seperate return label for last VEC1 because for VEC_SIZE ==
+/* Separate return label for last VEC1 because for VEC_SIZE ==
 32 we can reuse return code in L(page_cross) but VEC_SIZE ==
 64 has mismatched registers. */
 # if VEC_SIZE == 64
@@ -480,7 +480,7 @@ L(cross_page_boundary_real):
 */
 xorl $((1 << CHAR_PER_VEC)- 1), %eax
 # endif
-/* Use arithmatic shift so that leading 1s are filled in. */
+/* Use arithmetic shift so that leading 1s are filled in. */
 sarx %VGPR(SHIFT_REG), %VRAX, %VRAX
 /* If eax is all ones then no matches for esi or NULL. */

View File

@@ -86,7 +86,7 @@ L(next_48_bytes):
 jne L(return)
 L(loop_start):
 /* We use this alignment to force loop be aligned to 8 but not
-16 bytes. This gives better sheduling on AMD processors. */
+16 bytes. This gives better scheduling on AMD processors. */
 .p2align 4
 pxor %xmm6, %xmm6
 andq $-64, %rdi

View File

@@ -194,7 +194,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
 mov %fs:(%rax), %LOCALE_REG_LP
-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
@@ -501,7 +501,7 @@ L(more_3x_vec):
 L(prepare_loop):
 # ifdef USE_AS_STRNCMP
-/* Store N + (VEC_SIZE * 4) and place check at the begining of
+/* Store N + (VEC_SIZE * 4) and place check at the beginning of
 the loop. */
 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
 # endif
@@ -762,7 +762,7 @@ L(page_cross_during_loop):
 .p2align 4,, 4
 L(less_1x_vec_till_page_cross):
 subl $-(VEC_SIZE * 4), %eax
-/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
 concerning case is first iteration if incoming s1 was near start
 of a page and s2 near end. If s1 was near the start of the page
 we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
@@ -948,7 +948,7 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 /* If both are VEC aligned we don't need any special logic here.
-Only valid for strcmp where stop condition is guranteed to be
+Only valid for strcmp where stop condition is guaranteed to be
 reachable by just reading memory. */
 testl $((VEC_SIZE - 1) << 20), %eax
 jz L(no_page_cross)
@@ -984,7 +984,7 @@ L(page_cross):
 subl $(VEC_SIZE * 3), %eax
 jg L(less_1x_vec_till_page)
-/* If more than 1x VEC till page cross, loop throuh safely
+/* If more than 1x VEC till page cross, loop through safely
 loadable memory until within 1x VEC of page cross. */
 .p2align 4,, 10
@@ -1007,9 +1007,9 @@ L(page_cross_loop):
 jl L(page_cross_loop)
 subl %eax, %OFFSET_REG
-/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 to not cross page so is safe to load. Since we have already
-loaded at least 1 VEC from rsi it is also guranteed to be
+loaded at least 1 VEC from rsi it is also guaranteed to be
 safe. */
 VMOVU (%rdi, %OFFSET_REG64), %ymm0

View File

@@ -217,7 +217,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
 mov %fs:(%rax), %LOCALE_REG_LP
-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
@@ -455,7 +455,7 @@ L(return_vec_3):
 # endif
 /* If CHAR_PER_VEC == 64 we can't combine matches from the last
-2x VEC so need seperate return label. */
+2x VEC so need separate return label. */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
 bsf %VRCX, %VRCX
@@ -567,7 +567,7 @@ L(prepare_loop_no_len):
 shrl $2, %ecx
 leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
 # else
-/* Store N + (VEC_SIZE * 4) and place check at the begining of
+/* Store N + (VEC_SIZE * 4) and place check at the beginning of
 the loop. */
 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
 L(prepare_loop_no_len):
@@ -840,7 +840,7 @@ L(ret7):
 /* If CHAR_PER_VEC == 64 we can't combine matches from the last
-2x VEC so need seperate return label. */
+2x VEC so need separate return label. */
 # if CHAR_PER_VEC == 64
 L(return_vec_2_end):
 bsf %VRCX, %VRCX
@@ -906,7 +906,7 @@ L(page_cross_during_loop):
 .p2align 4,, 4
 L(less_1x_vec_till_page_cross):
 subl $-(VEC_SIZE * 4), %eax
-/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
 concerning case is first iteration if incoming s1 was near start
 of a page and s2 near end. If s1 was near the start of the page
 we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
@@ -997,7 +997,7 @@ L(return_page_cross_end_check):
 and %VR10, %VRCX
 /* Need to use tzcnt here as VRCX may be zero. If VRCX is zero
 tzcnt(VRCX) will be CHAR_PER and remaining length (edx) is
-guranteed to be <= CHAR_PER_VEC so we will only use the return
+guaranteed to be <= CHAR_PER_VEC so we will only use the return
 idx if VRCX was non-zero. */
 tzcnt %VRCX, %VRCX
 leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
@@ -1147,7 +1147,7 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 /* If both are VEC aligned we don't need any special logic here.
-Only valid for strcmp where stop condition is guranteed to
+Only valid for strcmp where stop condition is guaranteed to
 be reachable by just reading memory. */
 testl $((VEC_SIZE - 1) << 20), %eax
 jz L(no_page_cross)
@@ -1185,7 +1185,7 @@ L(page_cross):
 jg L(less_1x_vec_till_page)
-/* If more than 1x VEC till page cross, loop throuh safely
+/* If more than 1x VEC till page cross, loop through safely
 loadable memory until within 1x VEC of page cross. */
 .p2align 4,, 8
 L(page_cross_loop):
@@ -1209,9 +1209,9 @@ L(page_cross_loop):
 subl %eax, %OFFSET_REG
-/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
 to not cross page so is safe to load. Since we have already
-loaded at least 1 VEC from rsi it is also guranteed to be
+loaded at least 1 VEC from rsi it is also guaranteed to be
 safe. */
 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
 VPTESTM %VMM(0), %VMM(0), %k2

View File

@@ -20,7 +20,7 @@
 /* Continue building as ISA level 2. We use this as ISA V2 default
 because strcmp-sse42 uses pcmpstri (slow on some SSE4.2
-processors) and this implementation is potenially faster than
+processors) and this implementation is potentially faster than
 strcmp-sse42 (aside from the slower page cross case). */
 #if ISA_SHOULD_BUILD (2)

View File

@@ -75,7 +75,7 @@ ENTRY2 (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RDX_LP
-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END2 (STRCASECMP)
 /* FALLTHROUGH to strcasecmp_l. */
@@ -89,7 +89,7 @@ ENTRY2 (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RCX_LP
-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END2 (STRCASECMP)
 /* FALLTHROUGH to strncasecmp_l. */
@@ -186,7 +186,7 @@ ENTRY (STRCMP)
 jnz LABEL(less16bytes) /* If not, find different value or null char */
 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 sub $16, %r11
-jbe LABEL(strcmp_exitz) /* finish comparision */
+jbe LABEL(strcmp_exitz) /* finish comparison */
 # endif
 add $16, %rsi /* prepare to search next 16 bytes */
 add $16, %rdi /* prepare to search next 16 bytes */
@@ -400,7 +400,7 @@ LABEL(nibble_ashr_1):
 # endif
 pxor %xmm0, %xmm0
-sub $0x1000, %r10 /* substract 4K from %r10 */
+sub $0x1000, %r10 /* subtract 4K from %r10 */
 jmp LABEL(gobble_ashr_1)
 /*

View File

@@ -84,7 +84,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RDX_LP
-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strcasecmp_l. */
@@ -94,7 +94,7 @@ ENTRY (STRCASECMP)
 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
 mov %fs:(%rax),%RCX_LP
-/* Either 1 or 5 bytes (dependeing if CET is enabled). */
+/* Either 1 or 5 bytes (depending if CET is enabled). */
 .p2align 4
 END (STRCASECMP)
 /* FALLTHROUGH to strncasecmp_l. */

View File

@@ -50,7 +50,7 @@ ENTRY (STRCPY)
 5:
 movq $0xfefefefefefefeff,%r8
-/* Now the sources is aligned. Unfortunatly we cannot force
+/* Now the sources is aligned. Unfortunately we cannot force
 to have both source and destination aligned, so ignore the
 alignment of the destination. */
 .p2align 4

View File

@@ -224,7 +224,7 @@ L(cross_page_continue):
 since data is only aligned to VEC_SIZE. */
 # ifdef USE_AS_STRNLEN
 /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
-because it simplies the logic in last_4x_vec_or_less. */
+because it simplifies the logic in last_4x_vec_or_less. */
 leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 subq %rdx, %rcx
 # ifdef USE_AS_WCSLEN

View File

@@ -236,7 +236,7 @@ L(more_1x_vec):
 VMOVU %VMM(0), (%rdi)
 /* We are going to align rsi here so will need to be able to re-
-adjust rdi/rdx afterwords. NB: We filtered out huge lengths
+adjust rdi/rdx afterwards. NB: We filtered out huge lengths
 so rsi + rdx * CHAR_SIZE cannot overflow. */
 leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx

View File

@@ -99,7 +99,7 @@ L(page_cross_continue):
 /* `jb` because length rdx is now length - CHAR_SIZE. */
 jbe L(less_1x_vec)
-/* This may overset but thats fine because we still need to zero
+/* This may overset but that's fine because we still need to zero
 fill. */
 VMOVU %VMM(0), (%rdi)

View File

@@ -130,7 +130,7 @@ L(page_cross_continue):
 jae L(more_1x_vec)
 /* If there where multiple zero-CHAR matches in the first VEC,
-VRCX will be overset but thats fine since any oversets where
+VRCX will be overset but that's fine since any oversets where
 at zero-positions anyways. */
 # ifdef USE_AS_STPCPY
@@ -177,7 +177,7 @@ L(more_1x_vec):
 # endif
-/* This may overset but thats fine because we still need to zero
+/* This may overset but that's fine because we still need to zero
 fill. */
 VMOVU %VMM(0), (%rdi)
@@ -189,7 +189,7 @@ L(more_1x_vec):
 /* We are going to align rsi here so will need to be able to re-
-adjust rdi/rdx afterwords. NB: We filtered out huge lengths
+adjust rdi/rdx afterwards. NB: We filtered out huge lengths
 so rsi + rdx * CHAR_SIZE cannot overflow. */
 leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
 subq %rsi, %rdi
@@ -221,7 +221,7 @@ L(last_2x_vec):
 cmpl $(CHAR_PER_VEC), %edx
 jb L(ret_vec_x1_len)
-/* Seperate logic for CHAR_PER_VEC == 64 because we already did
+/* Separate logic for CHAR_PER_VEC == 64 because we already did
 `tzcnt` on VRCX. */
 # if CHAR_PER_VEC == 64
 /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
@@ -296,7 +296,7 @@ L(ret_vec_x1_no_bsf):
 .p2align 4,, 8
 L(last_4x_vec):
-/* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl
+/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
 $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
 using `movzbl`. */
 # if CHAR_PER_VEC == 64
@@ -677,7 +677,7 @@ L(copy_16_31):
 vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
 cmpl %ecx, %edx
-/* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
 we have a larger copy block for 32-63 so this is just falls
 through to zfill 16-31. If VEC_SIZE == 32 then we check for
 full zfill of less 1x VEC. */

View File

@@ -336,7 +336,7 @@ L(loop_last_4x_vec):
 VPTESTN %VMM(3), %VMM(3), %k0
-/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 individually, for VEC_SIZE == 32 we combine them in a single
 64-bit GPR. */

View File

@@ -176,7 +176,7 @@ L(aligned_more):
 .p2align 4
 L(first_aligned_loop):
 /* Do 2x VEC at a time. Any more and the cost of finding the
-match outweights loop benefit. */
+match outweighs loop benefit. */
 vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
 vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
@@ -324,7 +324,7 @@ L(cross_page):
 vmovdqu (%rsi), %ymm1
 VPCMPEQ %ymm1, %ymm0, %ymm6
 vpmovmskb %ymm6, %ecx
-/* Shift out zero CHAR matches that are before the begining of
+/* Shift out zero CHAR matches that are before the beginning of
 src (rdi). */
 shrxl %edi, %ecx, %ecx
 testl %ecx, %ecx
@@ -332,7 +332,7 @@ L(cross_page):
 VPCMPEQ %ymm1, %ymm7, %ymm1
 vpmovmskb %ymm1, %eax
-/* Shift out search CHAR matches that are before the begining of
+/* Shift out search CHAR matches that are before the beginning of
 src (rdi). */
 shrxl %edi, %eax, %eax
 blsmskl %ecx, %ecx

View File

@@ -152,7 +152,7 @@ L(loop):
 jnz L(loop_vec_x2_match)
 KMOV %k1, %VRDX
-/* Match is in first vector, rdi offset need to be substracted
+/* Match is in first vector, rdi offset need to be subtracted
 by VEC_SIZE. */
 sub $VEC_SIZE, %r8
@@ -216,7 +216,7 @@ L(check_last_match):
 ret
 /* No match recorded in r8. Check the second saved vector
-in begining. */
+in beginning. */
 L(vector_x2_ret):
 VPCMPEQ %VMM(2), %VMM(0), %k2
 KMOV %k2, %VRAX

View File

@@ -139,7 +139,7 @@ L(first_vec_x1_or_x2):
 KORTEST %k2, %k3
 jz L(first_vec_x0_test)
-/* Guranteed that VEC(2) and VEC(3) are within range so merge
+/* Guaranteed that VEC(2) and VEC(3) are within range so merge
 the two bitmasks then get last result. */
 kunpck_2x %k2, %k3, %k3
 kmov_2x %k3, %maskm_2x
@@ -192,7 +192,7 @@ L(first_vec_x2):
 .p2align 4,, 12
 L(aligned_more):
-/* Need to keep original pointer incase VEC(1) has last match.
+/* Need to keep original pointer in case VEC(1) has last match.
 */
 movq %rdi, %rsi
 andq $-VEC_SIZE, %rdi
@@ -222,7 +222,7 @@ L(aligned_more):
 .p2align 4,, 10
 L(first_aligned_loop):
 /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
-gurantee they don't store a match. */
+guarantee they don't store a match. */
 VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
 VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)
@@ -285,7 +285,7 @@ L(second_aligned_loop_prep):
 L(second_aligned_loop_set_furthest_match):
 movq %rdi, %rsi
 /* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
-port0 and have noticable overhead in the loop. */
+port0 and have noticeable overhead in the loop. */
 VMOVA %VMM(5), %VMM(7)
 VMOVA %VMM(6), %VMM(8)
 .p2align 4
@@ -351,7 +351,7 @@ L(cross_page_boundary):
 /* eax contains all the page offset bits of src (rdi). `xor rdi,
 rax` sets pointer will all page offset bits cleared so
 offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
-before page cross (guranteed to be safe to read). Doing this
+before page cross (guaranteed to be safe to read). Doing this
 as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
 a bit of code size. */
 xorq %rdi, %rax
@@ -359,7 +359,7 @@ L(cross_page_boundary):
 VPTESTN %VMM(1), %VMM(1), %k0
 KMOV %k0, %VRCX
-/* Shift out zero CHAR matches that are before the begining of
+/* Shift out zero CHAR matches that are before the beginning of
 src (rdi). */
 # ifdef USE_AS_WCSRCHR
 movl %edi, %esi
@@ -374,7 +374,7 @@ L(cross_page_boundary):
 /* Found zero CHAR so need to test for search CHAR. */
 VPCMP $0, %VMATCH, %VMM(1), %k1
 KMOV %k1, %VRAX
-/* Shift out search CHAR matches that are before the begining of
+/* Shift out search CHAR matches that are before the beginning of
 src (rdi). */
 shrx %VGPR(SHIFT_REG), %VRAX, %VRAX

View File

@@ -166,7 +166,7 @@ L(first_loop):
 /* Do 2x VEC at a time. */
 movaps (VEC_SIZE * 2)(%rdi), %xmm4
 movaps (VEC_SIZE * 3)(%rdi), %xmm5
-/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+/* Since SSE2 no pminud so wcsrchr needs separate logic for
 detecting zero. Note if this is found to be a bottleneck it
 may be worth adding an SSE4.1 wcsrchr implementation. */
 # ifdef USE_AS_WCSRCHR
@@ -238,7 +238,7 @@ L(new_match):
 /* We can't reuse either of the old comparisons as since we mask
 of zeros after first zero (instead of using the full
-comparison) we can't gurantee no interference between match
+comparison) we can't guarantee no interference between match
 after end of string and valid match. */
 pmovmskb %xmm4, %eax
 pmovmskb %xmm7, %edx
@@ -268,7 +268,7 @@ L(second_loop_match):
 L(second_loop):
 movaps (VEC_SIZE * 2)(%rdi), %xmm4
 movaps (VEC_SIZE * 3)(%rdi), %xmm5
-/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+/* Since SSE2 no pminud so wcsrchr needs separate logic for
 detecting zero. Note if this is found to be a bottleneck it
 may be worth adding an SSE4.1 wcsrchr implementation. */
 # ifdef USE_AS_WCSRCHR
@@ -297,11 +297,11 @@ L(second_loop):
 pmovmskb %xmm6, %eax
 addq $(VEC_SIZE * 2), %rdi
-/* Either null term or new occurence of CHAR. */
+/* Either null term or new occurrence of CHAR. */
 addl %ecx, %eax
 jz L(second_loop)
-/* No null term so much be new occurence of CHAR. */
+/* No null term so much be new occurrence of CHAR. */
 testl %ecx, %ecx
 jz L(second_loop_match)
@@ -331,7 +331,7 @@ L(second_loop_new_match):
 /* We can't reuse either of the old comparisons as since we mask
 of zeros after first zero (instead of using the full
-comparison) we can't gurantee no interference between match
+comparison) we can't guarantee no interference between match
 after end of string and valid match. */
 pmovmskb %xmm4, %eax
 pmovmskb %xmm7, %edx

View File

@@ -140,7 +140,7 @@ __strstr_avx512 (const char *haystack, const char *ned)
 = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
 uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
 cmpmask = cmpmask & cvtmask64_u64 (loadmask);
-/* Search for the 2 charaters of needle */
+/* Search for the 2 characters of needle */
 __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
 __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
 k1 = kshiftri_mask64 (k1, 1);