x86: Add support for building {w}memcmp{eq} with explicit ISA level

1. Refactor files so that all implementations are in the multiarch
   directory
    - Moved the implementation portion of memcmp sse2 from memcmp.S to
      multiarch/memcmp-sse2.S

    - The non-multiarch file now only includes one of the
      implementations in the multiarch directory based on the compiled
      ISA level (only used for non-multiarch builds.  Otherwise we go
      through the ifunc selector).

2. Add ISA level build guards to different implementations.
    - I.e memcmp-avx2-movsb.S which is ISA level 3 will only build if
      compiled ISA level <= 3. Otherwise there is no reason to include
      it as we will always use one of the ISA level 4
      implementations (memcmp-evex-movbe.S).

3. Add new multiarch/rtld-{w}memcmp{eq}.S that just include the
   non-multiarch {w}memcmp{eq}.S which will in turn select the best
   implementation based on the compiled ISA level.

4. Refactor the ifunc selector and ifunc implementation list to use
   the ISA level aware wrapper macros that allow functions below the
   compiled ISA level (with a guranteed replacement) to be skipped.

Tested with and without multiarch on x86_64 for ISA levels:
{generic, x86-64-v2, x86-64-v3, x86-64-v4}

And m32 with and without multiarch.
This commit is contained in:
Noah Goldstein 2022-06-29 16:11:22 -07:00
parent 37ecc657b2
commit ae308947ff
20 changed files with 791 additions and 662 deletions

View File

@ -30,7 +30,8 @@
since IFUNC must be set up by init_cpu_features. */
# if defined USE_MULTIARCH && !defined SHARED
# ifdef __x86_64__
# define DEFAULT_MEMCMP __memcmp_sse2
/* DEFAULT_MEMCMP by sysdeps/x86_64/memcmp-isa-default-impl.h. */
# include <sysdeps/x86_64/memcmp-isa-default-impl.h>
# else
# define DEFAULT_MEMCMP __memcmp_ia32
# endif

View File

@ -0,0 +1,28 @@
/* Set default memcmp impl based on ISA level.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL == 1 || MINIMUM_X86_ISA_LEVEL == 2
# define DEFAULT_MEMCMP __memcmp_sse2
#elif MINIMUM_X86_ISA_LEVEL == 3
# define DEFAULT_MEMCMP __memcmp_avx2_movbe
#elif MINIMUM_X86_ISA_LEVEL == 4
# define DEFAULT_MEMCMP __memcmp_evex_movbe
#else
# error "Unknown default memcmp implementation"
#endif

View File

@ -1,4 +1,4 @@
/* memcmp with SSE2
/* memcmp hook for non-multiarch and RTLD build.
Copyright (C) 2009-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@ -16,563 +16,13 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#define MEMCMP memcmp
#ifdef USE_AS_WMEMCMP
# define PCMPEQ pcmpeqd
# define CHAR_SIZE 4
# define SIZE_OFFSET (0)
#else
# define PCMPEQ pcmpeqb
# define CHAR_SIZE 1
#endif
#define DEFAULT_IMPL_V1 "multiarch/memcmp-sse2.S"
#define DEFAULT_IMPL_V3 "multiarch/memcmp-avx2-movbe.S"
#define DEFAULT_IMPL_V4 "multiarch/memcmp-evex-movbe.S"
#ifdef USE_AS_MEMCMPEQ
# define SIZE_OFFSET (0)
# define CHECK_CMP(x, y) subl x, y
#else
# ifndef SIZE_OFFSET
# define SIZE_OFFSET (CHAR_PER_VEC * 2)
# endif
# define CHECK_CMP(x, y) cmpl x, y
#endif
#include "isa-default-impl.h"
#define VEC_SIZE 16
#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
.text
ENTRY(MEMCMP)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
#ifdef USE_AS_WMEMCMP
/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
#endif
cmpq $CHAR_PER_VEC, %rdx
ja L(more_1x_vec)
#ifdef USE_AS_WMEMCMP
/* saves a byte of code keeping the fall through path n = [2, 4]
in the initial cache line. */
decl %edx
jle L(cmp_0_1)
movq (%rsi), %xmm0
movq (%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_0)
movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_end_0_adj)
#else
cmpl $8, %edx
ja L(cmp_9_16)
cmpl $4, %edx
jb L(cmp_0_3)
# ifdef USE_AS_MEMCMPEQ
movl (%rsi), %eax
subl (%rdi), %eax
movl -4(%rsi, %rdx), %esi
subl -4(%rdi, %rdx), %esi
orl %esi, %eax
ret
# else
/* Combine comparisons for lo and hi 4-byte comparisons. */
movl -4(%rsi, %rdx), %ecx
movl -4(%rdi, %rdx), %eax
shlq $32, %rcx
shlq $32, %rax
movl (%rsi), %esi
movl (%rdi), %edi
orq %rsi, %rcx
orq %rdi, %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
ret
# endif
.p2align 4,, 10
L(cmp_9_16):
# ifdef USE_AS_MEMCMPEQ
movq (%rsi), %rax
subq (%rdi), %rax
movq -8(%rsi, %rdx), %rcx
subq -8(%rdi, %rdx), %rcx
orq %rcx, %rax
/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
return long). */
setnz %cl
movzbl %cl, %eax
# else
movq (%rsi), %rcx
movq (%rdi), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
movq -8(%rdi, %rdx, CHAR_SIZE), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
# endif
#endif
ret
.p2align 4,, 8
L(cmp_0_1):
/* Flag set by earlier comparison against 1. */
jne L(cmp_0_0)
#ifdef USE_AS_WMEMCMP
movl (%rdi), %ecx
xorl %edx, %edx
cmpl (%rsi), %ecx
je L(cmp_0_0)
setg %dl
leal -1(%rdx, %rdx), %eax
#else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
subl %ecx, %eax
#endif
ret
/* Fits in aligning bytes. */
L(cmp_0_0):
xorl %eax, %eax
ret
#ifdef USE_AS_WMEMCMP
.p2align 4
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movl (%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
ret
#else
# ifndef USE_AS_MEMCMPEQ
.p2align 4,, 14
L(ret_nonzero):
/* Need to bswap to get proper return without branch. */
bswapq %rcx
bswapq %rax
subq %rcx, %rax
sbbl %eax, %eax
orl $1, %eax
ret
# endif
.p2align 4
L(cmp_0_3):
# ifdef USE_AS_MEMCMPEQ
/* No reason to add to dependency chain on rdx. Saving a the
bytes here doesn't change number of fetch blocks. */
cmpl $1, %edx
jbe L(cmp_0_1)
# else
/* We need the code size to prevent taking an extra fetch block.
*/
decl %edx
jle L(cmp_0_1)
# endif
movzwl (%rsi), %ecx
movzwl (%rdi), %eax
# ifdef USE_AS_MEMCMPEQ
subl %ecx, %eax
movzbl -1(%rsi, %rdx), %esi
movzbl -1(%rdi, %rdx), %edi
subl %edi, %esi
orl %esi, %eax
# else
bswapl %ecx
bswapl %eax
/* Implicit right shift by one. We just need to displace the
sign bits. */
shrl %ecx
shrl %eax
/* Eat a partial register stall here. Saves code stopping
L(cmp_0_3) from bleeding into the next fetch block and saves
an ALU. */
movb (%rsi, %rdx), %cl
movzbl (%rdi, %rdx), %edi
orl %edi, %eax
subl %ecx, %eax
# endif
ret
#endif
.p2align 5
L(more_1x_vec):
#ifndef USE_AS_WMEMCMP
/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
#endif
movups (%rsi), %xmm0
movups (%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_0)
#if SIZE_OFFSET == 0
cmpq $(CHAR_PER_VEC * 2), %rdx
#else
/* Offset rdx. Saves just enough code size to keep the
L(last_2x_vec) case and the non-zero return in a single
cache line. */
subq $(CHAR_PER_VEC * 2), %rdx
#endif
ja L(more_2x_vec)
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
#ifndef USE_AS_MEMCMPEQ
/* Don't use `incw ax` as machines this code runs on are liable
to have partial register stall. */
jnz L(ret_nonzero_vec_end_0)
#else
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
#endif
ret
#ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
.p2align 4
L(ret_nonzero_vec_end_0_adj):
addl $3, %edx
# else
.p2align 4,, 8
# endif
L(ret_nonzero_vec_end_0):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
# ifndef USE_AS_WMEMCMP
.p2align 4,, 10
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movzbl (%rsi, %rax), %ecx
movzbl (%rdi, %rax), %eax
subl %ecx, %eax
ret
# endif
#else
#endif
.p2align 5
L(more_2x_vec):
movups (VEC_SIZE * 1)(%rsi), %xmm0
movups (VEC_SIZE * 1)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_1)
cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
jbe L(last_2x_vec)
cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
ja L(more_8x_vec)
/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
This can harm performance if non-zero return in [65, 80] or
[97, 112] but helps performance otherwise. Generally zero-
return is hotter. */
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jnz L(ret_nonzero_vec_start_2_3)
cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 4)(%rsi), %xmm0
movups (VEC_SIZE * 4)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 5)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ
jz L(last_2x_vec)
ret
#else
jnz L(ret_nonzero_vec_start_4_5)
#endif
.p2align 4
L(last_2x_vec):
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
#ifdef USE_AS_MEMCMPEQ
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
ret
#else
jnz L(ret_nonzero_vec_end_1)
ret
.p2align 4,, 8
L(ret_nonzero_vec_end_1):
pmovmskb %xmm1, %ecx
/* High 16 bits of eax guranteed to be all ones. Rotate them in
to we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
/* Partial register stall. */
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_vec_start_4_5):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4,, 8
L(ret_nonzero_vec_start_1):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
.p2align 4
L(more_8x_vec):
subq %rdi, %rsi
leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
andq $(VEC_SIZE * -1), %rdi
addq %rdi, %rsi
.p2align 4
L(loop_4x):
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 3)(%rsi), %xmm1
PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
movups (VEC_SIZE * 4)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rsi), %xmm3
PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
pand %xmm0, %xmm1
pand %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
jnz L(ret_nonzero_loop)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
cmpq %rdi, %rdx
ja L(loop_4x)
/* Get remaining length in edx. */
subl %edi, %edx
/* Restore offset so we can reuse L(last_2x_vec). */
addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
#ifdef USE_AS_WMEMCMP
shrl $2, %edx
#endif
cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
ret
#else
.p2align 4
L(ret_nonzero_vec_start_2_3):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_loop):
pmovmskb %xmm0, %ecx
pmovmskb %xmm1, %edx
sall $(VEC_SIZE * 1), %edx
leal 1(%rcx, %rdx), %edx
pmovmskb %xmm2, %ecx
/* High 16 bits of eax guranteed to be all ones. Rotate them in
to we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
salq $32, %rax
orq %rdx, %rax
bsfq %rax, %rax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
END(MEMCMP)
#ifndef USE_AS_WMEMCMP
# ifdef USE_AS_MEMCMPEQ
libc_hidden_def (MEMCMP)
# else
# undef bcmp
weak_alias (MEMCMP, bcmp)
libc_hidden_builtin_def (MEMCMP)
# endif
#endif
libc_hidden_builtin_def(memcmp)
weak_alias (memcmp, bcmp)

View File

@ -1,4 +1,4 @@
/* __memcmpeq optimized with SSE2.
/* __memcmpeq hook for non-multiarch and RTLD build.
Copyright (C) 2017-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@ -16,6 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define MEMCMP __memcmpeq
#define USE_AS_MEMCMPEQ 1
#include "multiarch/memcmp-sse2.S"
#define MEMCMPEQ __memcmpeq
#define DEFAULT_IMPL_V1 "multiarch/memcmpeq-sse2.S"
#define DEFAULT_IMPL_V3 "multiarch/memcmpeq-avx2.S"
#define DEFAULT_IMPL_V4 "multiarch/memcmpeq-evex.S"
#include "isa-default-impl.h"
libc_hidden_def(__memcmpeq)

View File

@ -36,21 +36,25 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcmpeq.c. */
IFUNC_IMPL (i, name, __memcmpeq,
IFUNC_IMPL_ADD (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__memcmpeq_avx2)
IFUNC_IMPL_ADD (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__memcmpeq_avx2_rtm)
IFUNC_IMPL_ADD (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memcmpeq_evex)
IFUNC_IMPL_ADD (array, i, __memcmpeq, 1, __memcmpeq_sse2))
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memcmpeq_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__memcmpeq_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__memcmpeq_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memcmpeq,
1,
__memcmpeq_sse2))
/* Support sysdeps/x86_64/multiarch/memchr.c. */
IFUNC_IMPL (i, name, memchr,
@ -79,24 +83,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
IFUNC_IMPL (i, name, memcmp,
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__memcmp_avx2_movbe_rtm)
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
/* NB: If any of these names change or if any new
implementations are added be sure to update
sysdeps/x86_64/memcmp-isa-default-impl.h. */
X86_IFUNC_IMPL_ADD_V4 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__memcmp_avx2_movbe_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memcmp,
1,
__memcmp_sse2))
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
@ -805,24 +816,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
IFUNC_IMPL (i, name, wmemcmp,
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__wmemcmp_avx2_movbe_rtm)
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
X86_IFUNC_IMPL_ADD_V4 (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__wmemcmp_avx2_movbe_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wmemcmp,
1,
__wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
IFUNC_IMPL (i, name, wmemset,

View File

@ -19,29 +19,33 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
const struct cpu_features *cpu_features = __get_cpu_features ();
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
&& CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load, ))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex_movbe);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_movbe_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
return OPTIMIZE (avx2_movbe);
}

View File

@ -19,28 +19,32 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
const struct cpu_features *cpu_features = __get_cpu_features ();
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load, ))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE1 (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE1 (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
return OPTIMIZE1 (avx2);
}

View File

@ -16,7 +16,9 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
/* memcmp/wmemcmp is implemented as:
1. Use ymm vector compares when possible. The only case where

View File

@ -16,7 +16,10 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* memcmp/wmemcmp is implemented as:
1. Use ymm vector compares when possible. The only case where

View File

@ -16,24 +16,565 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
#include <isa-level.h>
/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)
#include <sysdep.h>
# ifndef MEMCMP
# define MEMCMP __memcmp_sse2
# endif
# ifdef SHARED
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name)
# undef libc_hidden_def
# define libc_hidden_def(ignored)
# ifdef USE_AS_WMEMCMP
# define PCMPEQ pcmpeqd
# define CHAR_SIZE 4
# define SIZE_OFFSET (0)
# else
# define PCMPEQ pcmpeqb
# define CHAR_SIZE 1
# endif
# undef weak_alias
# define weak_alias(ignored1, ignored2)
# ifdef USE_AS_MEMCMPEQ
# define SIZE_OFFSET (0)
# define CHECK_CMP(x, y) subl x, y
# else
# ifndef SIZE_OFFSET
# define SIZE_OFFSET (CHAR_PER_VEC * 2)
# endif
# define CHECK_CMP(x, y) cmpl x, y
# endif
# undef strong_alias
# define strong_alias(ignored1, ignored2)
# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# ifndef MEMCMP
# define MEMCMP memcmp
# endif
.text
ENTRY(MEMCMP)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
# endif
cmpq $CHAR_PER_VEC, %rdx
ja L(more_1x_vec)
# ifdef USE_AS_WMEMCMP
/* saves a byte of code keeping the fall through path n = [2, 4]
in the initial cache line. */
decl %edx
jle L(cmp_0_1)
movq (%rsi), %xmm0
movq (%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_0)
movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_end_0_adj)
# else
cmpl $8, %edx
ja L(cmp_9_16)
cmpl $4, %edx
jb L(cmp_0_3)
# ifdef USE_AS_MEMCMPEQ
movl (%rsi), %eax
subl (%rdi), %eax
movl -4(%rsi, %rdx), %esi
subl -4(%rdi, %rdx), %esi
orl %esi, %eax
ret
# else
/* Combine comparisons for lo and hi 4-byte comparisons. */
movl -4(%rsi, %rdx), %ecx
movl -4(%rdi, %rdx), %eax
shlq $32, %rcx
shlq $32, %rax
movl (%rsi), %esi
movl (%rdi), %edi
orq %rsi, %rcx
orq %rdi, %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
ret
# endif
.p2align 4,, 10
L(cmp_9_16):
# ifdef USE_AS_MEMCMPEQ
movq (%rsi), %rax
subq (%rdi), %rax
movq -8(%rsi, %rdx), %rcx
subq -8(%rdi, %rdx), %rcx
orq %rcx, %rax
/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
return long). */
setnz %cl
movzbl %cl, %eax
# else
movq (%rsi), %rcx
movq (%rdi), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
movq -8(%rdi, %rdx, CHAR_SIZE), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
# endif
# endif
ret
.p2align 4,, 8
L(cmp_0_1):
/* Flag set by earlier comparison against 1. */
jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
movl (%rdi), %ecx
xorl %edx, %edx
cmpl (%rsi), %ecx
je L(cmp_0_0)
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
subl %ecx, %eax
# endif
ret
/* Fits in aligning bytes. */
L(cmp_0_0):
xorl %eax, %eax
ret
# ifdef USE_AS_WMEMCMP
.p2align 4
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movl (%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
ret
# else
# ifndef USE_AS_MEMCMPEQ
.p2align 4,, 14
L(ret_nonzero):
/* Need to bswap to get proper return without branch. */
bswapq %rcx
bswapq %rax
subq %rcx, %rax
sbbl %eax, %eax
orl $1, %eax
ret
# endif
.p2align 4
L(cmp_0_3):
# ifdef USE_AS_MEMCMPEQ
/* No reason to add to dependency chain on rdx. Saving a the
bytes here doesn't change number of fetch blocks. */
cmpl $1, %edx
jbe L(cmp_0_1)
# else
/* We need the code size to prevent taking an extra fetch block.
*/
decl %edx
jle L(cmp_0_1)
# endif
movzwl (%rsi), %ecx
movzwl (%rdi), %eax
# ifdef USE_AS_MEMCMPEQ
subl %ecx, %eax
movzbl -1(%rsi, %rdx), %esi
movzbl -1(%rdi, %rdx), %edi
subl %edi, %esi
orl %esi, %eax
# else
bswapl %ecx
bswapl %eax
/* Implicit right shift by one. We just need to displace the
sign bits. */
shrl %ecx
shrl %eax
/* Eat a partial register stall here. Saves code stopping
L(cmp_0_3) from bleeding into the next fetch block and saves
an ALU. */
movb (%rsi, %rdx), %cl
movzbl (%rdi, %rdx), %edi
orl %edi, %eax
subl %ecx, %eax
# endif
ret
# endif
.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
# endif
movups (%rsi), %xmm0
movups (%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
cmpq $(CHAR_PER_VEC * 2), %rdx
# else
/* Offset rdx. Saves just enough code size to keep the
L(last_2x_vec) case and the non-zero return in a single
cache line. */
subq $(CHAR_PER_VEC * 2), %rdx
# endif
ja L(more_2x_vec)
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
/* Don't use `incw ax` as machines this code runs on are liable
to have partial register stall. */
jnz L(ret_nonzero_vec_end_0)
# else
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
ret
# ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
.p2align 4
L(ret_nonzero_vec_end_0_adj):
addl $3, %edx
# else
.p2align 4,, 8
# endif
L(ret_nonzero_vec_end_0):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
# ifndef USE_AS_WMEMCMP
.p2align 4,, 10
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movzbl (%rsi, %rax), %ecx
movzbl (%rdi, %rax), %eax
subl %ecx, %eax
ret
# endif
# else
# endif
.p2align 5
L(more_2x_vec):
movups (VEC_SIZE * 1)(%rsi), %xmm0
movups (VEC_SIZE * 1)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_1)
cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
jbe L(last_2x_vec)
cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
ja L(more_8x_vec)
/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
This can harm performance if non-zero return in [65, 80] or
[97, 112] but helps performance otherwise. Generally zero-
return is hotter. */
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jnz L(ret_nonzero_vec_start_2_3)
cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 4)(%rsi), %xmm0
movups (VEC_SIZE * 4)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 5)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
jz L(last_2x_vec)
ret
# else
jnz L(ret_nonzero_vec_start_4_5)
# endif
.p2align 4
L(last_2x_vec):
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
ret
# else
jnz L(ret_nonzero_vec_end_1)
ret
.p2align 4,, 8
L(ret_nonzero_vec_end_1):
pmovmskb %xmm1, %ecx
/* High 16 bits of eax guranteed to be all ones. Rotate them in
to we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
/* Partial register stall. */
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_vec_start_4_5):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4,, 8
L(ret_nonzero_vec_start_1):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
# endif
.p2align 4
L(more_8x_vec):
subq %rdi, %rsi
leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
andq $(VEC_SIZE * -1), %rdi
addq %rdi, %rsi
.p2align 4
L(loop_4x):
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 3)(%rsi), %xmm1
PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
movups (VEC_SIZE * 4)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rsi), %xmm3
PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
pand %xmm0, %xmm1
pand %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
jnz L(ret_nonzero_loop)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
cmpq %rdi, %rdx
ja L(loop_4x)
/* Get remaining length in edx. */
subl %edi, %edx
/* Restore offset so we can reuse L(last_2x_vec). */
addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
shrl $2, %edx
# endif
cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
ret
# else
.p2align 4
L(ret_nonzero_vec_start_2_3):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_loop):
pmovmskb %xmm0, %ecx
pmovmskb %xmm1, %edx
sall $(VEC_SIZE * 1), %edx
leal 1(%rcx, %rdx), %edx
pmovmskb %xmm2, %ecx
/* High 16 bits of eax guranteed to be all ones. Rotate them in
to we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
salq $32, %rax
orq %rdx, %rax
bsfq %rax, %rax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
# endif
END(MEMCMP)
#endif
#include <sysdeps/x86_64/memcmp.S>

View File

@ -16,7 +16,9 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
/* __memcmpeq is implemented as:
1. Use ymm vector compares when possible. The only case where

View File

@ -16,7 +16,9 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* __memcmpeq is implemented as:
1. Use ymm vector compares when possible. The only case where

View File

@ -16,10 +16,10 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define MEMCMP __memcmpeq_sse2
#else
# define MEMCMP __memcmpeq
#ifndef MEMCMPEQ
# define MEMCMPEQ __memcmpeq_sse2
#endif
#define MEMCMP MEMCMPEQ
#define USE_AS_MEMCMPEQ 1
#include "memcmp-sse2.S"

View File

@ -0,0 +1,18 @@
/* Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "../memcmp.S"

View File

@ -0,0 +1,18 @@
/* Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "../memcmpeq.S"

View File

@ -0,0 +1,18 @@
/* Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "../wmemcmp.S"

View File

@ -1,4 +1,8 @@
#define MEMCMP __wmemcmp_avx2_movbe
#ifndef WMEMCMP
# define WMEMCMP __wmemcmp_avx2_movbe
#endif
#define MEMCMP WMEMCMP
#define USE_AS_WMEMCMP 1
#include "memcmp-avx2-movbe.S"

View File

@ -1,4 +1,8 @@
#define MEMCMP __wmemcmp_evex_movbe
#ifndef WMEMCMP
# define WMEMCMP __wmemcmp_evex_movbe
#endif
#define MEMCMP WMEMCMP
#define USE_AS_WMEMCMP 1
#include "memcmp-evex-movbe.S"

View File

@ -16,6 +16,11 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef WMEMCMP
# define WMEMCMP __wmemcmp_sse2
#endif
#define MEMCMP WMEMCMP
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_sse2
#include "../memcmp.S"
#include "memcmp-sse2.S"

View File

@ -1,4 +1,4 @@
/* wmemcmp optimized with SSE2.
/* wmemcmp hook for non-multiarch and RTLD build.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@ -16,8 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define MEMCMP __wmemcmp
#define USE_AS_WMEMCMP 1
#include "memcmp.S"
#define WMEMCMP __wmemcmp
#define DEFAULT_IMPL_V1 "multiarch/wmemcmp-sse2.S"
#define DEFAULT_IMPL_V3 "multiarch/wmemcmp-avx2-movbe.S"
#define DEFAULT_IMPL_V4 "multiarch/wmemcmp-evex-movbe.S"
#include "isa-default-impl.h"
weak_alias (__wmemcmp, wmemcmp)