Fix misspellings in sysdeps/x86_64 -- BZ 25337.

Applying this commit results in bit-identical rebuild of libc.so.6
math/libm.so.6 elf/ld-linux-x86-64.so.2 mathvec/libmvec.so.1

Reviewed-by: Florian Weimer <fweimer@redhat.com>
Author: Paul Pluzhnikov, 2023-05-23 03:57:01 +00:00
parent ec9a66cd01
commit 1e9d5987fd
37 changed files with 105 additions and 105 deletions

View File

@ -28,11 +28,11 @@
# undef BASE
# if (STATE_SAVE_ALIGNMENT % 16) != 0
# error STATE_SAVE_ALIGNMENT must be multples of 16
# error STATE_SAVE_ALIGNMENT must be multiple of 16
# endif
# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
# endif
# if DL_RUNTIME_RESOLVE_REALIGN_STACK
@ -43,7 +43,7 @@
/* Use fxsave to save XMM registers. */
# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET)
# if (REGISTER_SAVE_AREA % 16) != 0
# error REGISTER_SAVE_AREA must be multples of 16
# error REGISTER_SAVE_AREA must be multiple of 16
# endif
# endif
# else
@ -57,7 +57,7 @@
# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
# define BASE rsp
# if (REGISTER_SAVE_AREA % 16) != 8
# error REGISTER_SAVE_AREA must be odd multples of 8
# error REGISTER_SAVE_AREA must be odd multiple of 8
# endif
# endif
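
A side note on the odd-multiple-of-8 requirement checked just above (my reading, not stated in the file): on the lazy-binding path the resolver is entered with %rsp at 8 mod 16 (a 16-byte aligned call site, an 8-byte return address, and an even number of 8-byte PLT pushes), so subtracting an odd multiple of 8 is exactly what restores 16-byte alignment for the register save area. A minimal sketch of the arithmetic in plain C, with illustrative constants rather than glibc's real values:

#include <assert.h>

int
main (void)
{
  /* Entry stack pointer: 8 mod 16 after the call and the PLT pushes.  */
  unsigned long rsp_at_entry = 0x7ffffffde9c8UL;
  /* Illustrative save-area size: 472 = 59 * 8, an odd multiple of 8.  */
  unsigned long register_save_area = 472;

  assert (rsp_at_entry % 16 == 8);
  assert (register_save_area % 16 == 8);
  /* Subtracting an odd multiple of 8 restores 16-byte alignment.  */
  assert ((rsp_at_entry - register_save_area) % 16 == 0);
  return 0;
}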
@ -161,7 +161,7 @@ _dl_runtime_resolve:
#if !defined PROF && defined _dl_runtime_profile
# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
# error LR_VECTOR_OFFSET must be multples of VEC_SIZE
# error LR_VECTOR_OFFSET must be multiple of VEC_SIZE
# endif
.globl _dl_runtime_profile
@ -173,7 +173,7 @@ _dl_runtime_profile:
cfi_adjust_cfa_offset(16) # Incorporate PLT
_CET_ENDBR
/* The La_x86_64_regs data structure pointed to by the
fourth paramater must be VEC_SIZE-byte aligned. This must
fourth parameter must be VEC_SIZE-byte aligned. This must
be explicitly enforced. We have the set up a dynamically
sized stack frame. %rbx points to the top half which
has a fixed size and preserves the original stack pointer. */

View File

@ -31,7 +31,7 @@ __feupdateenv (const fenv_t *envp)
/* Install new environment. */
__fesetenv (envp);
/* Raise the saved exception. Incidently for us the implementation
/* Raise the saved exception. Incidentally for us the implementation
defined format of the values in objects of type fexcept_t is the
same as the ones specified using the FE_* constants. */
__feraiseexcept ((int) temp);

View File

@ -1,4 +1,4 @@
/* Common definition for strcasecmp famly ifunc selections.
/* Common definition for strcasecmp family ifunc selections.
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2017-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.

View File

@ -440,13 +440,13 @@ L(loop_4x_vec):
ymm0-15 is used at all is because there is no EVEX encoding
vpcmpeq and with vpcmpeq this loop can be performed more
efficiently. The non-vzeroupper version is safe for RTM
while the vzeroupper version should be prefered if RTM are
while the vzeroupper version should be preferred if RTM are
not supported. Which loop version we use is determined by
USE_TERN_IN_LOOP. */
# if USE_TERN_IN_LOOP
/* Since vptern can only take 3x vectors fastest to do 1 vec
seperately with EVEX vpcmp. */
separately with EVEX vpcmp. */
# ifdef USE_AS_WMEMCHR
/* vptern can only accept masks for epi32/epi64 so can only save
instruction using not equals mask on vptern with wmemchr.
@ -539,7 +539,7 @@ L(last_vec_x1_novzero):
# if CHAR_PER_VEC == 64
/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
64 it needs a seperate return label. */
64 it needs a separate return label. */
.p2align 4,, 4
L(last_vec_x2):
L(last_vec_x2_novzero):
@ -579,8 +579,8 @@ L(loop_vec_ret):
(only if used VEX encoded loop). */
COND_VZEROUPPER
/* Seperate logic for CHAR_PER_VEC == 64 vs the rest. For
CHAR_PER_VEC we test the last 2x VEC seperately, for
/* Separate logic for CHAR_PER_VEC == 64 vs the rest. For
CHAR_PER_VEC we test the last 2x VEC separately, for
CHAR_PER_VEC <= 32 we can combine the results from the 2x
VEC in a single GPR. */
# if CHAR_PER_VEC == 64

View File

@ -29,7 +29,7 @@
3. Use xmm vector compare when size >= 4 bytes for memcmp or
size >= 8 bytes for wmemcmp.
4. Optimistically compare up to first 4 * VEC_SIZE one at a
to check for early mismatches. Only do this if its guranteed the
to check for early mismatches. Only do this if its guaranteed the
work is not wasted.
5. If size is 8 * VEC_SIZE or less, unroll the loop.
6. Compare 4 * VEC_SIZE at a time with the aligned first memory
@ -66,7 +66,7 @@
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elemnts.
memcmp has to use UNSIGNED comparison for elements.
*/
.section SECTION(.text),"ax",@progbits
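
The signed/unsigned warning above reflects the standard semantics of the two interfaces themselves; a small stand-alone C illustration (not part of this file) of why one comparison cannot serve both:

#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  /* memcmp must order bytes as unsigned char: 0x80 (128) > 0x01.  */
  unsigned char a = 0x80, b = 0x01;
  printf ("memcmp sign:  %d\n", memcmp (&a, &b, 1) > 0);

  /* wmemcmp orders elements as wchar_t, which is signed on x86-64
     GNU/Linux, so -1 < 1 even though its bit pattern is larger.  */
  wchar_t wa = -1, wb = 1;
  printf ("wmemcmp sign: %d\n", wmemcmp (&wa, &wb, 1) < 0);
  return 0;
}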

View File

@ -30,7 +30,7 @@
3. Use xmm vector compare when size >= 4 bytes for memcmp or
size >= 8 bytes for wmemcmp.
4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
to check for early mismatches. Only do this if its guranteed the
to check for early mismatches. Only do this if its guaranteed the
work is not wasted.
5. If size is 8 * VEC_SIZE or less, unroll the loop.
6. Compare 4 * VEC_SIZE at a time with the aligned first memory
@ -90,7 +90,7 @@ Latency:
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elemnts.
memcmp has to use UNSIGNED comparison for elements.
*/
.section SECTION(.text), "ax", @progbits
@ -105,7 +105,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
/* Fall through for [0, VEC_SIZE] as its the hottest. */
ja L(more_1x_vec)
/* Create mask of bytes that are guranteed to be valid because
/* Create mask of bytes that are guaranteed to be valid because
of length (edx). Using masked movs allows us to skip checks
for page crosses/zero size. */
mov $-1, %VRAX
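
A hedged sketch of the validity mask the comment above describes (my model of the intent, not the exact instruction sequence; the helper name is invented): the all-ones value just loaded is truncated to the low len bit positions, and the masked load/compare then ignores every byte lane past the length.

#include <stdint.h>
#include <stdio.h>

static uint64_t
valid_byte_mask (unsigned int len, unsigned int vec_size)
{
  /* One bit per byte lane: low LEN bits set, saturating at VEC_SIZE.  */
  if (len >= vec_size)
    return ~0ULL >> (64 - vec_size);
  return (1ULL << len) - 1;
}

int
main (void)
{
  printf ("%#llx\n", (unsigned long long) valid_byte_mask (5, 32));   /* 0x1f */
  printf ("%#llx\n", (unsigned long long) valid_byte_mask (40, 32));  /* 0xffffffff */
  return 0;
}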
@ -365,7 +365,7 @@ L(loop_4x_vec):
/* Load regardless of branch. */
VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
/* Seperate logic as we can only use testb for VEC_SIZE == 64.
/* Separate logic as we can only use testb for VEC_SIZE == 64.
*/
# if VEC_SIZE == 64
testb %dil, %dil

View File

@ -410,7 +410,7 @@ L(ret_nonzero_vec_start_4_5):
.p2align 4,, 8
L(ret_nonzero_vec_end_1):
pmovmskb %xmm1, %ecx
/* High 16 bits of eax guranteed to be all ones. Rotate them in
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
to we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
@ -562,7 +562,7 @@ L(ret_nonzero_loop):
sall $(VEC_SIZE * 1), %edx
leal 1(%rcx, %rdx), %edx
pmovmskb %xmm2, %ecx
/* High 16 bits of eax guranteed to be all ones. Rotate them in
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
to we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax

View File

@ -26,7 +26,7 @@
and loading from either s1 or s2 would cause a page cross.
2. Use xmm vector compare when size >= 8 bytes.
3. Optimistically compare up to first 4 * VEC_SIZE one at a
to check for early mismatches. Only do this if its guranteed the
to check for early mismatches. Only do this if its guaranteed the
work is not wasted.
4. If size is 8 * VEC_SIZE or less, unroll the loop.
5. Compare 4 * VEC_SIZE at a time with the aligned first memory
@ -302,7 +302,7 @@ L(between_9_15):
movq -8(%rsi, %rdx), %rdi
subq %rdi, %rcx
orq %rcx, %rax
/* edx is guranteed to be a non-zero int. */
/* edx is guaranteed to be a non-zero int. */
cmovnz %edx, %eax
ret

View File

@ -26,7 +26,7 @@
and loading from either s1 or s2 would cause a page cross.
2. Use xmm vector compare when size >= 8 bytes.
3. Optimistically compare up to first 4 * VEC_SIZE one at a
to check for early mismatches. Only do this if its guranteed the
to check for early mismatches. Only do this if its guaranteed the
work is not wasted.
4. If size is 8 * VEC_SIZE or less, unroll the loop.
5. Compare 4 * VEC_SIZE at a time with the aligned first memory
@ -97,7 +97,7 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
/* Fall through for [0, VEC_SIZE] as its the hottest. */
ja L(more_1x_vec)
/* Create mask of bytes that are guranteed to be valid because
/* Create mask of bytes that are guaranteed to be valid because
of length (edx). Using masked movs allows us to skip checks
for page crosses/zero size. */
mov $-1, %VRAX
@ -253,7 +253,7 @@ L(loop_4x_vec):
oring with VEC(4). Result is stored in VEC(4). */
vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
/* Seperate logic as we can only use testb for VEC_SIZE == 64.
/* Separate logic as we can only use testb for VEC_SIZE == 64.
*/
# if VEC_SIZE == 64
testb %dil, %dil
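
A hedged aside on the $0xf6 immediate used in the vpternlogd above: the immediate is an 8-entry truth table indexed by (dest << 2) | (src1 << 1) | src2, and 0xf6 encodes a | (b ^ c), which is how a single instruction can xor-compare the two data vectors and accumulate the result into VEC(4). A small derivation sketch in plain C (helper names invented for illustration):

#include <stdio.h>

/* Build a vpternlogd immediate from a boolean function of
   (dest, src1, src2); bit (a << 2) | (b << 1) | c holds f (a, b, c).  */
static unsigned char
ternlog_imm (int (*f) (int, int, int))
{
  unsigned char imm = 0;
  for (int idx = 0; idx < 8; idx++)
    if (f ((idx >> 2) & 1, (idx >> 1) & 1, idx & 1))
      imm |= (unsigned char) (1u << idx);
  return imm;
}

/* a | (b ^ c): accumulate the xor-mismatch of b and c into a.  */
static int
or_xor (int a, int b, int c)
{
  return a | (b ^ c);
}

int
main (void)
{
  printf ("%#x\n", ternlog_imm (or_xor));  /* Prints 0xf6.  */
  return 0;
}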

View File

@ -231,7 +231,7 @@ L(end_loop_fwd):
movups %xmm7, 48(%r8)
ret
/* Extactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
60 bytes otherwise. */
# define ALIGNED_LOOP_FWD(align_by); \
.p2align 6; \
@ -368,7 +368,7 @@ L(end_loop_bkwd):
ret
/* Extactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
60 bytes otherwise. */
# define ALIGNED_LOOP_BKWD(align_by); \
.p2align 6; \

View File

@ -445,7 +445,7 @@ L(more_8x_vec_check):
shrq $63, %r8
/* Get 4k difference dst - src. */
andl $(PAGE_SIZE - 256), %ecx
/* If r8 is non-zero must do foward for correctness. Otherwise
/* If r8 is non-zero must do forward for correctness. Otherwise
if ecx is non-zero there is 4k False Alaising so do backward
copy. */
addl %r8d, %ecx
@ -460,7 +460,7 @@ L(more_8x_vec_forward):
/* First vec was already loaded into VEC(0). */
VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
/* Save begining of dst. */
/* Save beginning of dst. */
movq %rdi, %rcx
/* Align dst to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
@ -517,7 +517,7 @@ L(more_8x_vec_backward):
/* First vec was also loaded into VEC(0). */
VMOVU VEC_SIZE(%rsi), %VMM(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VMM(6)
/* Begining of region for 4x backward copy stored in rcx. */
/* Beginning of region for 4x backward copy stored in rcx. */
leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(8)
@ -611,7 +611,7 @@ L(movsb):
movq %rdi, %r8
# endif
/* If above __x86_rep_movsb_stop_threshold most likely is
candidate for NT moves aswell. */
candidate for NT moves as well. */
cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
jae L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB

View File

@ -65,7 +65,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
L(ret_vec_x0_test):
/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
will gurantee edx (len) is less than it. */
will guarantee edx (len) is less than it. */
lzcntl %ecx, %ecx
/* Hoist vzeroupper (not great for RTM) to save code size. This allows
@ -233,7 +233,7 @@ L(more_4x_vec):
jnz L(ret_vec_x3)
/* Check if near end before re-aligning (otherwise might do an
unnecissary loop iteration). */
unnecessary loop iteration). */
addq $-(VEC_SIZE * 4), %rax
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_4x_vec)
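
The lzcnt behaviour that the comment in the first hunk above relies on is easy to check in isolation; a minimal illustration, assuming a compiler that exposes the LZCNT intrinsic (build with -mlzcnt on GCC/Clang):

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Unlike bsr, lzcnt is defined for a zero input: it returns the
     operand width (32 here), so a "no match" mask always yields a
     count larger than any remaining length.  */
  printf ("%u\n", _lzcnt_u32 (0));              /* 32 */
  printf ("%u\n", _lzcnt_u32 (0x00080000));     /* 12 */
  return 0;
}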

View File

@ -119,7 +119,7 @@ L(last_2x_vec):
# endif
jle L(zero_2)
/* We adjusted rax (length) for VEC_SIZE == 64 so need seperate
/* We adjusted rax (length) for VEC_SIZE == 64 so need separate
offsets. */
# if VEC_SIZE == 64
vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
@ -354,7 +354,7 @@ L(loop_4x_vec):
jnz L(first_vec_x1_end)
KMOV %k2, %VRCX
/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
individually, for VEC_SIZE == 32 we combine them in a single
64-bit GPR. */

View File

@ -50,7 +50,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
jz L(page_cross)
/* NB: This load happens regardless of whether rdx (len) is zero. Since
it doesn't cross a page and the standard gurantees any pointer have
it doesn't cross a page and the standard guarantees any pointer have
at least one-valid byte this load must be safe. For the entire
history of the x86 memrchr implementation this has been possible so
no code "should" be relying on a zero-length check before this load.

View File

@ -199,7 +199,7 @@ L(less_vec_from_wmemset):
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
andl $(PAGE_SIZE - 1), %edi
/* Check if VEC_SIZE store cross page. Mask stores suffer
serious performance degradation when it has to fault supress.
serious performance degradation when it has to fault suppress.
*/
cmpl $(PAGE_SIZE - VEC_SIZE), %edi
/* This is generally considered a cold target. */

View File

@ -187,13 +187,13 @@ L(loop_4x_vec):
ymm0-15 is used at all is because there is no EVEX encoding
vpcmpeq and with vpcmpeq this loop can be performed more
efficiently. The non-vzeroupper version is safe for RTM
while the vzeroupper version should be prefered if RTM are
while the vzeroupper version should be preferred if RTM are
not supported. Which loop version we use is determined by
USE_TERN_IN_LOOP. */
# if USE_TERN_IN_LOOP
/* Since vptern can only take 3x vectors fastest to do 1 vec
seperately with EVEX vpcmp. */
separately with EVEX vpcmp. */
VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
/* Compare 3x with vpcmpeq and or them all together with vptern.
*/
@ -256,7 +256,7 @@ L(loop_4x_vec):
(only if used VEX encoded loop). */
COND_VZEROUPPER
/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
individually, for VEC_SIZE == 32 we combine them in a single
64-bit GPR. */

View File

@ -163,7 +163,7 @@ ENTRY (STRCAT)
decl %ecx
jnz 21b
/* Now the sources is aligned. Unfortunatly we cannot force
/* Now the sources is aligned. Unfortunately we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
.p2align 4

View File

@ -1,4 +1,4 @@
/* strlen used for begining of str{n}cat using AVX2.
/* strlen used for beginning of str{n}cat using AVX2.
Copyright (C) 2011-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.

View File

@ -1,4 +1,4 @@
/* strlen used for begining of str{n}cat using EVEX 256/512.
/* strlen used for beginning of str{n}cat using EVEX 256/512.
Copyright (C) 2011-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.

View File

@ -160,7 +160,7 @@ L(last_vec_x2):
# endif
L(first_vec_x1):
/* Use bsf here to save 1-byte keeping keeping the block in 1x
fetch block. eax guranteed non-zero. */
fetch block. eax guaranteed non-zero. */
bsf %VRCX, %VRCX
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@ -294,7 +294,7 @@ L(loop_4x_vec):
/* Two methods for loop depending on VEC_SIZE. This is because
with zmm registers VPMINU can only run on p0 (as opposed to
p0/p1 for ymm) so it is less prefered. */
p0/p1 for ymm) so it is less preferred. */
# if VEC_SIZE == 32
/* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
zero. */
@ -340,7 +340,7 @@ L(loop_4x_vec):
esi, the corresponding bit in %k3 is zero so the
VPMINU_MASKZ will have a zero in the result). NB: This make
the VPMINU 3c latency. The only way to avoid it is to
createa a 12c dependency chain on all the `VPCMP $4, ...`
create a 12c dependency chain on all the `VPCMP $4, ...`
which has higher total latency. */
VPMINU %VMM(2), %VMM(4), %VMM(4){%k3}{z}
# endif
@ -366,7 +366,7 @@ L(loop_4x_vec):
# endif
/* COND_MASK integates the esi matches for VEC_SIZE == 64. For
/* COND_MASK integrates the esi matches for VEC_SIZE == 64. For
VEC_SIZE == 32 they are already integrated. */
VPTEST %VMM(2), %VMM(2), %k0 COND_MASK(k2)
KMOV %k0, %VRCX
@ -403,7 +403,7 @@ L(zero_end):
# endif
/* Seperate return label for last VEC1 because for VEC_SIZE ==
/* Separate return label for last VEC1 because for VEC_SIZE ==
32 we can reuse return code in L(page_cross) but VEC_SIZE ==
64 has mismatched registers. */
# if VEC_SIZE == 64
@ -480,7 +480,7 @@ L(cross_page_boundary_real):
*/
xorl $((1 << CHAR_PER_VEC)- 1), %eax
# endif
/* Use arithmatic shift so that leading 1s are filled in. */
/* Use arithmetic shift so that leading 1s are filled in. */
sarx %VGPR(SHIFT_REG), %VRAX, %VRAX
/* If eax is all ones then no matches for esi or NULL. */

View File

@ -86,7 +86,7 @@ L(next_48_bytes):
jne L(return)
L(loop_start):
/* We use this alignment to force loop be aligned to 8 but not
16 bytes. This gives better sheduling on AMD processors. */
16 bytes. This gives better scheduling on AMD processors. */
.p2align 4
pxor %xmm6, %xmm6
andq $-64, %rdi

View File

@ -194,7 +194,7 @@ ENTRY (STRCASECMP)
movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
mov %fs:(%rax), %LOCALE_REG_LP
/* Either 1 or 5 bytes (dependeing if CET is enabled). */
/* Either 1 or 5 bytes (depending if CET is enabled). */
.p2align 4
END (STRCASECMP)
/* FALLTHROUGH to strcasecmp/strncasecmp_l. */
@ -501,7 +501,7 @@ L(more_3x_vec):
L(prepare_loop):
# ifdef USE_AS_STRNCMP
/* Store N + (VEC_SIZE * 4) and place check at the begining of
/* Store N + (VEC_SIZE * 4) and place check at the beginning of
the loop. */
leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
# endif
@ -762,7 +762,7 @@ L(page_cross_during_loop):
.p2align 4,, 4
L(less_1x_vec_till_page_cross):
subl $-(VEC_SIZE * 4), %eax
/* Guranteed safe to read from rdi - VEC_SIZE here. The only
/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
concerning case is first iteration if incoming s1 was near start
of a page and s2 near end. If s1 was near the start of the page
we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
@ -948,7 +948,7 @@ L(ret9):
L(page_cross):
# ifndef USE_AS_STRNCMP
/* If both are VEC aligned we don't need any special logic here.
Only valid for strcmp where stop condition is guranteed to be
Only valid for strcmp where stop condition is guaranteed to be
reachable by just reading memory. */
testl $((VEC_SIZE - 1) << 20), %eax
jz L(no_page_cross)
@ -984,7 +984,7 @@ L(page_cross):
subl $(VEC_SIZE * 3), %eax
jg L(less_1x_vec_till_page)
/* If more than 1x VEC till page cross, loop throuh safely
/* If more than 1x VEC till page cross, loop through safely
loadable memory until within 1x VEC of page cross. */
.p2align 4,, 10
@ -1007,9 +1007,9 @@ L(page_cross_loop):
jl L(page_cross_loop)
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
to not cross page so is safe to load. Since we have already
loaded at least 1 VEC from rsi it is also guranteed to be
loaded at least 1 VEC from rsi it is also guaranteed to be
safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0

View File

@ -217,7 +217,7 @@ ENTRY (STRCASECMP)
movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
mov %fs:(%rax), %LOCALE_REG_LP
/* Either 1 or 5 bytes (dependeing if CET is enabled). */
/* Either 1 or 5 bytes (depending if CET is enabled). */
.p2align 4
END (STRCASECMP)
/* FALLTHROUGH to strcasecmp/strncasecmp_l. */
@ -455,7 +455,7 @@ L(return_vec_3):
# endif
/* If CHAR_PER_VEC == 64 we can't combine matches from the last
2x VEC so need seperate return label. */
2x VEC so need separate return label. */
L(return_vec_2):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
bsf %VRCX, %VRCX
@ -567,7 +567,7 @@ L(prepare_loop_no_len):
shrl $2, %ecx
leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
# else
/* Store N + (VEC_SIZE * 4) and place check at the begining of
/* Store N + (VEC_SIZE * 4) and place check at the beginning of
the loop. */
leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):
@ -840,7 +840,7 @@ L(ret7):
/* If CHAR_PER_VEC == 64 we can't combine matches from the last
2x VEC so need seperate return label. */
2x VEC so need separate return label. */
# if CHAR_PER_VEC == 64
L(return_vec_2_end):
bsf %VRCX, %VRCX
@ -906,7 +906,7 @@ L(page_cross_during_loop):
.p2align 4,, 4
L(less_1x_vec_till_page_cross):
subl $-(VEC_SIZE * 4), %eax
/* Guranteed safe to read from rdi - VEC_SIZE here. The only
/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
concerning case is first iteration if incoming s1 was near start
of a page and s2 near end. If s1 was near the start of the page
we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
@ -997,7 +997,7 @@ L(return_page_cross_end_check):
and %VR10, %VRCX
/* Need to use tzcnt here as VRCX may be zero. If VRCX is zero
tzcnt(VRCX) will be CHAR_PER and remaining length (edx) is
guranteed to be <= CHAR_PER_VEC so we will only use the return
guaranteed to be <= CHAR_PER_VEC so we will only use the return
idx if VRCX was non-zero. */
tzcnt %VRCX, %VRCX
leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
@ -1147,7 +1147,7 @@ L(ret9):
L(page_cross):
# ifndef USE_AS_STRNCMP
/* If both are VEC aligned we don't need any special logic here.
Only valid for strcmp where stop condition is guranteed to
Only valid for strcmp where stop condition is guaranteed to
be reachable by just reading memory. */
testl $((VEC_SIZE - 1) << 20), %eax
jz L(no_page_cross)
@ -1185,7 +1185,7 @@ L(page_cross):
jg L(less_1x_vec_till_page)
/* If more than 1x VEC till page cross, loop throuh safely
/* If more than 1x VEC till page cross, loop through safely
loadable memory until within 1x VEC of page cross. */
.p2align 4,, 8
L(page_cross_loop):
@ -1209,9 +1209,9 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
to not cross page so is safe to load. Since we have already
loaded at least 1 VEC from rsi it is also guranteed to be
loaded at least 1 VEC from rsi it is also guaranteed to be
safe. */
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
VPTESTM %VMM(0), %VMM(0), %k2

View File

@ -20,7 +20,7 @@
/* Continue building as ISA level 2. We use this as ISA V2 default
because strcmp-sse42 uses pcmpstri (slow on some SSE4.2
processors) and this implementation is potenially faster than
processors) and this implementation is potentially faster than
strcmp-sse42 (aside from the slower page cross case). */
#if ISA_SHOULD_BUILD (2)

View File

@ -75,7 +75,7 @@ ENTRY2 (STRCASECMP)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
/* Either 1 or 5 bytes (dependeing if CET is enabled). */
/* Either 1 or 5 bytes (depending if CET is enabled). */
.p2align 4
END2 (STRCASECMP)
/* FALLTHROUGH to strcasecmp_l. */
@ -89,7 +89,7 @@ ENTRY2 (STRCASECMP)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
/* Either 1 or 5 bytes (dependeing if CET is enabled). */
/* Either 1 or 5 bytes (depending if CET is enabled). */
.p2align 4
END2 (STRCASECMP)
/* FALLTHROUGH to strncasecmp_l. */
@ -186,7 +186,7 @@ ENTRY (STRCMP)
jnz LABEL(less16bytes) /* If not, find different value or null char */
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz) /* finish comparision */
jbe LABEL(strcmp_exitz) /* finish comparison */
# endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
@ -400,7 +400,7 @@ LABEL(nibble_ashr_1):
# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10 /* substract 4K from %r10 */
sub $0x1000, %r10 /* subtract 4K from %r10 */
jmp LABEL(gobble_ashr_1)
/*

View File

@ -84,7 +84,7 @@ ENTRY (STRCASECMP)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
/* Either 1 or 5 bytes (dependeing if CET is enabled). */
/* Either 1 or 5 bytes (depending if CET is enabled). */
.p2align 4
END (STRCASECMP)
/* FALLTHROUGH to strcasecmp_l. */
@ -94,7 +94,7 @@ ENTRY (STRCASECMP)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
/* Either 1 or 5 bytes (dependeing if CET is enabled). */
/* Either 1 or 5 bytes (depending if CET is enabled). */
.p2align 4
END (STRCASECMP)
/* FALLTHROUGH to strncasecmp_l. */

View File

@ -50,7 +50,7 @@ ENTRY (STRCPY)
5:
movq $0xfefefefefefefeff,%r8
/* Now the sources is aligned. Unfortunatly we cannot force
/* Now the sources is aligned. Unfortunately we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
.p2align 4

View File

@ -224,7 +224,7 @@ L(cross_page_continue):
since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
because it simplies the logic in last_4x_vec_or_less. */
because it simplifies the logic in last_4x_vec_or_less. */
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
subq %rdx, %rcx
# ifdef USE_AS_WCSLEN

View File

@ -236,7 +236,7 @@ L(more_1x_vec):
VMOVU %VMM(0), (%rdi)
/* We are going to align rsi here so will need to be able to re-
adjust rdi/rdx afterwords. NB: We filtered out huge lengths
adjust rdi/rdx afterwards. NB: We filtered out huge lengths
so rsi + rdx * CHAR_SIZE cannot overflow. */
leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx

View File

@ -99,7 +99,7 @@ L(page_cross_continue):
/* `jb` because length rdx is now length - CHAR_SIZE. */
jbe L(less_1x_vec)
/* This may overset but thats fine because we still need to zero
/* This may overset but that's fine because we still need to zero
fill. */
VMOVU %VMM(0), (%rdi)

View File

@ -130,7 +130,7 @@ L(page_cross_continue):
jae L(more_1x_vec)
/* If there where multiple zero-CHAR matches in the first VEC,
VRCX will be overset but thats fine since any oversets where
VRCX will be overset but that's fine since any oversets where
at zero-positions anyways. */
# ifdef USE_AS_STPCPY
@ -177,7 +177,7 @@ L(more_1x_vec):
# endif
/* This may overset but thats fine because we still need to zero
/* This may overset but that's fine because we still need to zero
fill. */
VMOVU %VMM(0), (%rdi)
@ -189,7 +189,7 @@ L(more_1x_vec):
/* We are going to align rsi here so will need to be able to re-
adjust rdi/rdx afterwords. NB: We filtered out huge lengths
adjust rdi/rdx afterwards. NB: We filtered out huge lengths
so rsi + rdx * CHAR_SIZE cannot overflow. */
leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
subq %rsi, %rdi
@ -221,7 +221,7 @@ L(last_2x_vec):
cmpl $(CHAR_PER_VEC), %edx
jb L(ret_vec_x1_len)
/* Seperate logic for CHAR_PER_VEC == 64 because we already did
/* Separate logic for CHAR_PER_VEC == 64 because we already did
`tzcnt` on VRCX. */
# if CHAR_PER_VEC == 64
/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
@ -296,7 +296,7 @@ L(ret_vec_x1_no_bsf):
.p2align 4,, 8
L(last_4x_vec):
/* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl
/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
$(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
using `movzbl`. */
# if CHAR_PER_VEC == 64
@ -677,7 +677,7 @@ L(copy_16_31):
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpl %ecx, %edx
/* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
we have a larger copy block for 32-63 so this is just falls
through to zfill 16-31. If VEC_SIZE == 32 then we check for
full zfill of less 1x VEC. */

View File

@ -336,7 +336,7 @@ L(loop_last_4x_vec):
VPTESTN %VMM(3), %VMM(3), %k0
/* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
individually, for VEC_SIZE == 32 we combine them in a single
64-bit GPR. */

View File

@ -176,7 +176,7 @@ L(aligned_more):
.p2align 4
L(first_aligned_loop):
/* Do 2x VEC at a time. Any more and the cost of finding the
match outweights loop benefit. */
match outweighs loop benefit. */
vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
@ -324,7 +324,7 @@ L(cross_page):
vmovdqu (%rsi), %ymm1
VPCMPEQ %ymm1, %ymm0, %ymm6
vpmovmskb %ymm6, %ecx
/* Shift out zero CHAR matches that are before the begining of
/* Shift out zero CHAR matches that are before the beginning of
src (rdi). */
shrxl %edi, %ecx, %ecx
testl %ecx, %ecx
@ -332,7 +332,7 @@ L(cross_page):
VPCMPEQ %ymm1, %ymm7, %ymm1
vpmovmskb %ymm1, %eax
/* Shift out search CHAR matches that are before the begining of
/* Shift out search CHAR matches that are before the beginning of
src (rdi). */
shrxl %edi, %eax, %eax
blsmskl %ecx, %ecx

View File

@ -152,7 +152,7 @@ L(loop):
jnz L(loop_vec_x2_match)
KMOV %k1, %VRDX
/* Match is in first vector, rdi offset need to be substracted
/* Match is in first vector, rdi offset need to be subtracted
by VEC_SIZE. */
sub $VEC_SIZE, %r8
@ -216,7 +216,7 @@ L(check_last_match):
ret
/* No match recorded in r8. Check the second saved vector
in begining. */
in beginning. */
L(vector_x2_ret):
VPCMPEQ %VMM(2), %VMM(0), %k2
KMOV %k2, %VRAX

View File

@ -139,7 +139,7 @@ L(first_vec_x1_or_x2):
KORTEST %k2, %k3
jz L(first_vec_x0_test)
/* Guranteed that VEC(2) and VEC(3) are within range so merge
/* Guaranteed that VEC(2) and VEC(3) are within range so merge
the two bitmasks then get last result. */
kunpck_2x %k2, %k3, %k3
kmov_2x %k3, %maskm_2x
@ -192,7 +192,7 @@ L(first_vec_x2):
.p2align 4,, 12
L(aligned_more):
/* Need to keep original pointer incase VEC(1) has last match.
/* Need to keep original pointer in case VEC(1) has last match.
*/
movq %rdi, %rsi
andq $-VEC_SIZE, %rdi
@ -222,7 +222,7 @@ L(aligned_more):
.p2align 4,, 10
L(first_aligned_loop):
/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
gurantee they don't store a match. */
guarantee they don't store a match. */
VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)
@ -285,7 +285,7 @@ L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
/* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
port0 and have noticable overhead in the loop. */
port0 and have noticeable overhead in the loop. */
VMOVA %VMM(5), %VMM(7)
VMOVA %VMM(6), %VMM(8)
.p2align 4
@ -351,7 +351,7 @@ L(cross_page_boundary):
/* eax contains all the page offset bits of src (rdi). `xor rdi,
rax` sets pointer will all page offset bits cleared so
offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
before page cross (guranteed to be safe to read). Doing this
before page cross (guaranteed to be safe to read). Doing this
as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
a bit of code size. */
xorq %rdi, %rax
@ -359,7 +359,7 @@ L(cross_page_boundary):
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VRCX
/* Shift out zero CHAR matches that are before the begining of
/* Shift out zero CHAR matches that are before the beginning of
src (rdi). */
# ifdef USE_AS_WCSRCHR
movl %edi, %esi
@ -374,7 +374,7 @@ L(cross_page_boundary):
/* Found zero CHAR so need to test for search CHAR. */
VPCMP $0, %VMATCH, %VMM(1), %k1
KMOV %k1, %VRAX
/* Shift out search CHAR matches that are before the begining of
/* Shift out search CHAR matches that are before the beginning of
src (rdi). */
shrx %VGPR(SHIFT_REG), %VRAX, %VRAX

View File

@ -166,7 +166,7 @@ L(first_loop):
/* Do 2x VEC at a time. */
movaps (VEC_SIZE * 2)(%rdi), %xmm4
movaps (VEC_SIZE * 3)(%rdi), %xmm5
/* Since SSE2 no pminud so wcsrchr needs seperate logic for
/* Since SSE2 no pminud so wcsrchr needs separate logic for
detecting zero. Note if this is found to be a bottleneck it
may be worth adding an SSE4.1 wcsrchr implementation. */
# ifdef USE_AS_WCSRCHR
@ -238,7 +238,7 @@ L(new_match):
/* We can't reuse either of the old comparisons as since we mask
of zeros after first zero (instead of using the full
comparison) we can't gurantee no interference between match
comparison) we can't guarantee no interference between match
after end of string and valid match. */
pmovmskb %xmm4, %eax
pmovmskb %xmm7, %edx
@ -268,7 +268,7 @@ L(second_loop_match):
L(second_loop):
movaps (VEC_SIZE * 2)(%rdi), %xmm4
movaps (VEC_SIZE * 3)(%rdi), %xmm5
/* Since SSE2 no pminud so wcsrchr needs seperate logic for
/* Since SSE2 no pminud so wcsrchr needs separate logic for
detecting zero. Note if this is found to be a bottleneck it
may be worth adding an SSE4.1 wcsrchr implementation. */
# ifdef USE_AS_WCSRCHR
@ -297,11 +297,11 @@ L(second_loop):
pmovmskb %xmm6, %eax
addq $(VEC_SIZE * 2), %rdi
/* Either null term or new occurence of CHAR. */
/* Either null term or new occurrence of CHAR. */
addl %ecx, %eax
jz L(second_loop)
/* No null term so much be new occurence of CHAR. */
/* No null term so much be new occurrence of CHAR. */
testl %ecx, %ecx
jz L(second_loop_match)
@ -331,7 +331,7 @@ L(second_loop_new_match):
/* We can't reuse either of the old comparisons as since we mask
of zeros after first zero (instead of using the full
comparison) we can't gurantee no interference between match
comparison) we can't guarantee no interference between match
after end of string and valid match. */
pmovmskb %xmm4, %eax
pmovmskb %xmm7, %edx

View File

@ -140,7 +140,7 @@ __strstr_avx512 (const char *haystack, const char *ned)
= cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
cmpmask = cmpmask & cvtmask64_u64 (loadmask);
/* Search for the 2 charaters of needle */
/* Search for the 2 characters of needle */
__mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
__mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
k1 = kshiftri_mask64 (k1, 1);
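
A hedged, stand-alone sketch of the x ^ (x - 1) idiom behind cmpmask above (helper name invented): it keeps every bit position up to and including the first null terminator, and degrades to all ones when no null was found, which is what bounds the candidate positions to the valid part of hay0.

#include <stdint.h>
#include <stdio.h>

static uint64_t
mask_through_first_set_bit (uint64_t x)
{
  /* For x != 0: all bits up to and including the lowest set bit.
     For x == 0: x - 1 wraps to UINT64_MAX, so the result is all ones.  */
  return x ^ (x - 1);
}

int
main (void)
{
  /* Nulls reported at bit positions 3 and 5: only positions 0-3 stay.  */
  printf ("%#llx\n",
          (unsigned long long) mask_through_first_set_bit (0x28));  /* 0xf */
  /* No null in the loaded chunk: every position remains valid.  */
  printf ("%#llx\n",
          (unsigned long long) mask_through_first_set_bit (0));
  return 0;
}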