mirror of
git://sourceware.org/git/glibc.git
synced 2025-01-18 12:16:13 +08:00
2f9062d717
No bug. This implementation refactors memcmp-sse4.S primarily with minimizing code size in mind. It does this by removing the lookup table logic and removing the unrolled check from (256, 512] bytes. memcmp-sse4 code size reduction: -3487 bytes. wmemcmp-sse4 code size reduction: -1472 bytes. The current memcmp-sse4.S implementation has a large code size cost. This has serious adverse effects on the ICache / ITLB. While in micro-benchmarks the implementation appears fast, traces of real-world code have shown that the speed in micro-benchmarks does not translate when the ICache/ITLB are not primed, and that the cost of the code size has measurable negative effects on overall application performance. See https://research.google/pubs/pub48320/ for more details. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
804 lines
14 KiB
ArmAsm
804 lines
14 KiB
ArmAsm
/* memcmp with SSE4.1, wmemcmp with SSE4.1
|
|
Copyright (C) 2010-2021 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#if IS_IN (libc)
|
|
|
|
# include <sysdep.h>
|
|
|
|
# ifndef MEMCMP
|
|
# define MEMCMP __memcmp_sse4_1
|
|
# endif
|
|
|
|
/* Select the element-equality instruction and the element width in bytes:
   the wmemcmp build compares 4-byte elements (pcmpeqd), the memcmp build
   compares single bytes (pcmpeqb).  */
#ifdef USE_AS_WMEMCMP
|
|
# define CMPEQ pcmpeqd
|
|
# define CHAR_SIZE 4
|
|
#else
|
|
# define CMPEQ pcmpeqb
|
|
# define CHAR_SIZE 1
|
|
#endif
|
|
|
|
|
|
/* Warning!
|
|
wmemcmp has to use SIGNED comparison for elements.
|
|
memcmp has to use UNSIGNED comparison for elements.
|
|
*/
|
|
|
|
/* MEMCMP entry point.
   Registers as used below: %rdi and %rsi are the two buffers, %rdx is the
   length, and the result is returned in %eax.
   This first part dispatches on the length and handles 4..7 bytes inline
   (memcmp build only).  */
.section .text.sse4.1,"ax",@progbits
|
|
ENTRY (MEMCMP)
|
|
# ifdef USE_AS_WMEMCMP
|
|
/* Multiply the length by 4 (CHAR_SIZE) so that everything below works
   in bytes.  */
shl $2, %RDX_LP
|
|
# elif defined __ILP32__
|
|
/* Clear the upper 32 bits. */
|
|
mov %edx, %edx
|
|
# endif
|
|
/* Lengths above 79 bytes take the large-size path.  */
cmp $79, %RDX_LP
|
|
ja L(79bytesormore)
|
|
|
|
/* Zero or one element goes to L(firstbyte), which reuses the flags of
   this compare to tell the two cases apart.  */
cmp $CHAR_SIZE, %RDX_LP
|
|
jbe L(firstbyte)
|
|
|
|
/* N in (CHAR_SIZE, 79] bytes. */
|
|
cmpl $32, %edx
|
|
ja L(more_32_bytes)
|
|
|
|
cmpl $16, %edx
|
|
jae L(16_to_32_bytes)
|
|
|
|
# ifndef USE_AS_WMEMCMP
|
|
/* memcmp build only from here: 2 <= N <= 15.  */
cmpl $8, %edx
|
|
jae L(8_to_16_bytes)
|
|
|
|
cmpl $4, %edx
|
|
jb L(2_to_3_bytes)
|
|
|
|
/* 4 <= N <= 7.  For each buffer build a 64-bit key: first 4 bytes
   (byte-swapped) in the high half, last 4 bytes (byte-swapped) in the
   low half.  The two loads may overlap.  After the byte swaps an
   unsigned integer comparison of the keys orders the buffers the same
   way a bytewise comparison would.  */
movl (%rdi), %eax
|
|
movl (%rsi), %ecx
|
|
|
|
bswap %eax
|
|
bswap %ecx
|
|
|
|
shlq $32, %rax
|
|
shlq $32, %rcx
|
|
|
|
/* The pointers are no longer needed, so %edi / %esi are reused for the
   trailing 4 bytes.  */
movl -4(%rdi, %rdx), %edi
|
|
movl -4(%rsi, %rdx), %esi
|
|
|
|
bswap %edi
|
|
bswap %esi
|
|
|
|
orq %rdi, %rax
|
|
orq %rsi, %rcx
|
|
/* ZF is set if the keys are equal, CF if the first key is below the
   second (unsigned).  */
subq %rcx, %rax
|
|
/* If the keys differ, make %eax a nonzero positive value (N, which is
   4..7 here).  cmov does not modify the flags.  */
cmovne %edx, %eax
|
|
/* %ecx = -1 if CF was set, otherwise 0.  */
sbbl %ecx, %ecx
|
|
/* Result: 0 if equal, -1 if first < second, positive otherwise.  */
orl %ecx, %eax
|
|
ret
|
|
|
|
/* N is 2 or 3 (memcmp build only).
   For each buffer pack byte 0 into bits 23..16 and byte 1 into bits 15..8,
   then OR the last byte (index N - 1) into bits 7..0.  Both values fit in
   24 bits, so the 32-bit subtraction cannot overflow and its sign gives
   the ordering.  */
.p2align 4,, 8
|
|
L(2_to_3_bytes):
|
|
movzwl (%rdi), %eax
|
|
movzwl (%rsi), %ecx
|
|
/* Shift left by one byte, then byte-swap: byte 0 ends up above byte 1
   and the lowest byte is left zero for the trailing byte.  */
shll $8, %eax
|
|
shll $8, %ecx
|
|
bswap %eax
|
|
bswap %ecx
|
|
movzbl -1(%rdi, %rdx), %edi
|
|
movzbl -1(%rsi, %rdx), %esi
|
|
orl %edi, %eax
|
|
orl %esi, %ecx
|
|
subl %ecx, %eax
|
|
ret
|
|
|
|
/* memcmp build: 8 <= N <= 15 (N == 16 is dispatched to
   L(16_to_32_bytes)).  Compare the first 8 bytes and, if they are equal,
   the last 8 bytes, each as a byte-swapped 64-bit integer.  The two
   windows may overlap.  */
.p2align 4,, 8
|
|
L(8_to_16_bytes):
|
|
movq (%rdi), %rax
|
|
movq (%rsi), %rcx
|
|
|
|
bswap %rax
|
|
bswap %rcx
|
|
|
|
subq %rcx, %rax
|
|
/* jne leaves the flags of the subtraction intact for the code at
   L(8_to_16_bytes_done).  */
jne L(8_to_16_bytes_done)
|
|
|
|
movq -8(%rdi, %rdx), %rax
|
|
movq -8(%rsi, %rdx), %rcx
|
|
|
|
bswap %rax
|
|
bswap %rcx
|
|
|
|
subq %rcx, %rax
|
|
|
|
/* Turn the flags of the last subq into the result: 0 if equal (ZF),
   -1 if first < second (CF), otherwise the positive value N.  */
L(8_to_16_bytes_done):
|
|
cmovne %edx, %eax
|
|
sbbl %ecx, %ecx
|
|
orl %ecx, %eax
|
|
ret
|
|
# else
|
|
/* wmemcmp build: N is 8 or 12 bytes here (2 or 3 elements).  Compare
   element 0, element 1 and the last element.  %eax is zeroed up front so
   that the equal case can return directly.  In this build
   L(8_to_16_bytes_done) is the signed-result tail located inside
   L(firstbyte).  */
xorl %eax, %eax
|
|
movl (%rdi), %ecx
|
|
cmpl (%rsi), %ecx
|
|
jne L(8_to_16_bytes_done)
|
|
movl 4(%rdi), %ecx
|
|
cmpl 4(%rsi), %ecx
|
|
jne L(8_to_16_bytes_done)
|
|
movl -4(%rdi, %rdx), %ecx
|
|
cmpl -4(%rsi, %rdx), %ecx
|
|
jne L(8_to_16_bytes_done)
|
|
ret
|
|
# endif
|
|
|
|
/* L(ret_zero) returns 0; L(zero) is a bare return for callers that have
   already zeroed %eax.  */
.p2align 4,, 3
|
|
L(ret_zero):
|
|
xorl %eax, %eax
|
|
L(zero):
|
|
ret
|
|
|
|
/* N <= CHAR_SIZE.  The flags are still those of the `cmp $CHAR_SIZE`
   at entry: "below" means N == 0, otherwise exactly one element is
   compared.  */
.p2align 4,, 8
|
|
L(firstbyte):
|
|
jb L(ret_zero)
|
|
# ifdef USE_AS_WMEMCMP
|
|
xorl %eax, %eax
|
|
movl (%rdi), %ecx
|
|
cmpl (%rsi), %ecx
|
|
je L(zero)
|
|
/* Also the target of the 2..3-element wmemcmp path.  Entered with %eax
   zero and the flags of a `cmpl` of the second buffer's element against
   the first buffer's element.  */
L(8_to_16_bytes_done):
|
|
/* Signed comparison: %al = 1 if first > second.  */
setg %al
|
|
/* %eax = 2 * %al - 1, i.e. 1 or -1.  */
leal -1(%rax, %rax), %eax
|
|
# else
|
|
/* Single byte: the result is the difference of the zero-extended
   bytes.  */
movzbl (%rdi), %eax
|
|
movzbl (%rsi), %ecx
|
|
sub %ecx, %eax
|
|
# endif
|
|
ret
|
|
|
|
/* Return paths for a mismatch in the 16-byte vector at offset 48
   (L(vec_return_begin_48)) or 32 (L(vec_return_begin_32)) from
   %rdi / %rsi.
   On entry %eax holds the compare mask after `incw %ax`; its lowest set
   bit is the byte offset, within the vector, of the first differing
   element.  */
.p2align 4
|
|
L(vec_return_begin_48):
|
|
/* Advance by 16 so the shared code below can use displacement 32.  */
addq $16, %rdi
|
|
addq $16, %rsi
|
|
L(vec_return_begin_32):
|
|
bsfl %eax, %eax
|
|
# ifdef USE_AS_WMEMCMP
|
|
/* Signed element comparison; result is 1 or -1.  */
movl 32(%rdi, %rax), %ecx
|
|
xorl %edx, %edx
|
|
cmpl 32(%rsi, %rax), %ecx
|
|
setg %dl
|
|
leal -1(%rdx, %rdx), %eax
|
|
# else
|
|
/* Difference of the zero-extended differing bytes.  */
movzbl 32(%rsi, %rax), %ecx
|
|
movzbl 32(%rdi, %rax), %eax
|
|
subl %ecx, %eax
|
|
# endif
|
|
ret
|
|
|
|
/* Return paths for a mismatch in the 16-byte vector at offset 16
   (L(vec_return_begin_16)) or 0 (L(vec_return_begin)) from %rdi / %rsi.
   On entry %eax holds the compare mask after `incw %ax`; its lowest set
   bit is the byte offset, within the vector, of the first differing
   element.  */
.p2align 4
|
|
L(vec_return_begin_16):
|
|
addq $16, %rdi
|
|
addq $16, %rsi
|
|
L(vec_return_begin):
|
|
bsfl %eax, %eax
|
|
# ifdef USE_AS_WMEMCMP
|
|
/* Signed element comparison; result is 1 or -1.  */
movl (%rdi, %rax), %ecx
|
|
xorl %edx, %edx
|
|
cmpl (%rsi, %rax), %ecx
|
|
setg %dl
|
|
leal -1(%rdx, %rdx), %eax
|
|
# else
|
|
/* Difference of the zero-extended differing bytes.  */
movzbl (%rsi, %rax), %ecx
|
|
movzbl (%rdi, %rax), %eax
|
|
subl %ecx, %eax
|
|
# endif
|
|
ret
|
|
|
|
/* Return paths for a mismatch in a vector addressed from the end of the
   buffers: the vector starts at %rdx - 16 (L(vec_return_end)) or
   %rdx - 32 (L(vec_return_end_16)) relative to %rdi / %rsi.
   On entry %eax holds the compare mask after `incw %ax`.  */
.p2align 4
|
|
L(vec_return_end_16):
|
|
subl $16, %edx
|
|
L(vec_return_end):
|
|
bsfl %eax, %eax
|
|
/* %eax = length + offset of the mismatch within the vector; the -16
   displacement below rebases it to the start of the vector.  */
addl %edx, %eax
|
|
# ifdef USE_AS_WMEMCMP
|
|
/* Signed element comparison; result is 1 or -1.  */
movl -16(%rdi, %rax), %ecx
|
|
xorl %edx, %edx
|
|
cmpl -16(%rsi, %rax), %ecx
|
|
setg %dl
|
|
leal -1(%rdx, %rdx), %eax
|
|
# else
|
|
/* Difference of the zero-extended differing bytes.  */
movzbl -16(%rsi, %rax), %ecx
|
|
movzbl -16(%rdi, %rax), %eax
|
|
subl %ecx, %eax
|
|
# endif
|
|
ret
|
|
|
|
/* 32 < N <= 79.  Compare the first 32 bytes (plus bytes 32..47 when
   N > 64), then fall into L(32_to_64_bytes) for the last 32 bytes.
   Each step: CMPEQ yields all-ones per equal element, pmovmskb collects
   a 16-bit mask, and `incw %ax` gives zero only when the mask was 0xffff
   (all equal).  */
.p2align 4,, 8
|
|
L(more_32_bytes):
|
|
movdqu (%rdi), %xmm0
|
|
movdqu (%rsi), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqu 16(%rdi), %xmm0
|
|
movdqu 16(%rsi), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
/* With N <= 64 the first 32 and last 32 bytes already cover
   everything.  */
cmpl $64, %edx
|
|
jbe L(32_to_64_bytes)
|
|
movdqu 32(%rdi), %xmm0
|
|
movdqu 32(%rsi), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
/* Compare the last 32 bytes, addressed from the end via %rdx; they may
   overlap bytes that were already checked.  The large-size paths also
   jump here for their tail.  Falls through to `ret` with %eax == 0 when
   everything matched.  */
.p2align 4,, 6
|
|
L(32_to_64_bytes):
|
|
movdqu -32(%rdi, %rdx), %xmm0
|
|
movdqu -32(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end_16)
|
|
|
|
movdqu -16(%rdi, %rdx), %xmm0
|
|
movdqu -16(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end)
|
|
ret
|
|
|
|
/* 16 <= N <= 32.  Compare the first 16 bytes and the last 16 bytes; the
   two windows may overlap.  Returns with %eax == 0 when both match.  */
.p2align 4
|
|
L(16_to_32_bytes):
|
|
movdqu (%rdi), %xmm0
|
|
movdqu (%rsi), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqu -16(%rdi, %rdx), %xmm0
|
|
movdqu -16(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end)
|
|
ret
|
|
|
|
|
|
/* N > 79.  Check the first 16 bytes with unaligned loads, then advance
   both pointers by 1..16 bytes so that %rsi becomes 16-byte aligned, and
   shrink %rdx by the same amount.  Bytes skipped this way were covered
   by the initial 16-byte check.  */
.p2align 4
|
|
L(79bytesormore):
|
|
movdqu (%rdi), %xmm0
|
|
movdqu (%rsi), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
|
|
/* %rsi = next 16-byte boundary strictly above the old %rsi;
   %rcx = old %rsi - new %rsi, a value in -16..-1.  */
mov %rsi, %rcx
|
|
and $-16, %rsi
|
|
add $16, %rsi
|
|
sub %rsi, %rcx
|
|
|
|
/* Apply the same advance to %rdi and subtract it from the length.  */
sub %rcx, %rdi
|
|
add %rcx, %rdx
|
|
/* If %rdi ended up 16-byte aligned too, use the aligned-load paths.  */
test $0xf, %rdi
|
|
jz L(2aligned)
|
|
|
|
cmp $128, %rdx
|
|
ja L(128bytesormore)
|
|
|
|
/* %rsi is 16-byte aligned, %rdi is not; 64..128 bytes remain in %rdx.
   Compare the first 64 bytes one vector at a time.  The memory-operand
   form of CMPEQ is used on %rsi, which is aligned here.  */
.p2align 4,, 6
|
|
L(less128bytes):
|
|
movdqu (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqu 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
movdqu 32(%rdi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
movdqu 48(%rdi), %xmm1
|
|
CMPEQ 48(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_48)
|
|
|
|
/* Fewer than 96 bytes in total: the last 32 bytes finish the job.  */
cmp $96, %rdx
|
|
jb L(32_to_64_bytes)
|
|
|
|
/* Otherwise step past the 64 bytes just compared.  */
addq $64, %rdi
|
|
addq $64, %rsi
|
|
subq $64, %rdx
|
|
|
|
/* 32..64 bytes remain: compare the first 32 bytes from the front and
   the last 32 bytes from the end (the windows may overlap).  */
.p2align 4,, 6
|
|
L(last_64_bytes):
|
|
movdqu (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqu 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
/* End-relative addresses are not necessarily aligned, so both sides
   are loaded with movdqu.  */
movdqu -32(%rdi, %rdx), %xmm0
|
|
movdqu -32(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end_16)
|
|
|
|
movdqu -16(%rdi, %rdx), %xmm0
|
|
movdqu -16(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end)
|
|
ret
|
|
|
|
/* %rsi aligned, %rdi unaligned, more than 128 bytes remain.  Above 256
   bytes use the loop; otherwise compare 128 bytes straight-line and
   dispatch on what is left.  */
.p2align 4
|
|
L(128bytesormore):
|
|
cmp $256, %rdx
|
|
ja L(unaligned_loop)
|
|
L(less256bytes):
|
|
movdqu (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqu 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
movdqu 32(%rdi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
movdqu 48(%rdi), %xmm1
|
|
CMPEQ 48(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_48)
|
|
|
|
/* Second 64-byte group.  The pointers are advanced first so that the
   shared L(vec_return_begin*) paths see the right base.  */
addq $64, %rdi
|
|
addq $64, %rsi
|
|
|
|
movdqu (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqu 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
movdqu 32(%rdi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
movdqu 48(%rdi), %xmm1
|
|
CMPEQ 48(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_48)
|
|
|
|
/* 128 bytes are done: drop them from the length and step the pointers
   past the second group.  1..128 bytes remain.  */
addq $-128, %rdx
|
|
subq $-64, %rsi
|
|
subq $-64, %rdi
|
|
|
|
cmp $64, %rdx
|
|
ja L(less128bytes)
|
|
|
|
cmp $32, %rdx
|
|
ja L(last_64_bytes)
|
|
|
|
/* At most 32 bytes remain: compare the last 32 bytes, addressed from
   the end.  This may reach back into bytes that were already compared
   above.  */
movdqu -32(%rdi, %rdx), %xmm0
|
|
movdqu -32(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end_16)
|
|
|
|
movdqu -16(%rdi, %rdx), %xmm0
|
|
movdqu -16(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end)
|
|
ret
|
|
|
|
/* %rsi aligned, %rdi unaligned, more than 256 bytes remain.
   %r8 = 3 * (half the data cache size); lengths above that threshold use
   the prefetching loop at L(L2_L3_cache_unaligned).  */
.p2align 4
|
|
L(unaligned_loop):
|
|
# ifdef DATA_CACHE_SIZE_HALF
|
|
mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
|
# else
|
|
mov __x86_data_cache_size_half(%rip), %R8_LP
|
|
# endif
|
|
movq %r8, %r9
|
|
addq %r8, %r8
|
|
addq %r9, %r8
|
|
cmpq %r8, %rdx
|
|
ja L(L2_L3_cache_unaligned)
|
|
/* Bias the count by -64 so that the `sub $64; ja` at the bottom of the
   loop exits once at most 64 bytes are left.  */
sub $64, %rdx
|
|
.p2align 4
|
|
/* Main loop: compare 64 bytes per iteration.  The four per-vector
   results are ANDed together so a single mask test covers the block.
   After the pands: %xmm0 = result for bytes 0..15, %xmm1 = results for
   0..31 combined, %xmm2 = result for bytes 32..47, %xmm3 = all four
   combined.  L(64bytesormore_loop_end) relies on this layout.  */
L(64bytesormore_loop):
|
|
movdqu (%rdi), %xmm0
|
|
movdqu 16(%rdi), %xmm1
|
|
movdqu 32(%rdi), %xmm2
|
|
movdqu 48(%rdi), %xmm3
|
|
|
|
CMPEQ (%rsi), %xmm0
|
|
CMPEQ 16(%rsi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm2
|
|
CMPEQ 48(%rsi), %xmm3
|
|
|
|
pand %xmm0, %xmm1
|
|
pand %xmm2, %xmm3
|
|
pand %xmm1, %xmm3
|
|
|
|
pmovmskb %xmm3, %eax
|
|
incw %ax
|
|
jnz L(64bytesormore_loop_end)
|
|
|
|
add $64, %rsi
|
|
add $64, %rdi
|
|
sub $64, %rdx
|
|
ja L(64bytesormore_loop)
|
|
|
|
/* Shared tail of all four loops.  %rdx is (bytes remaining - 64), a
   value in -63..0, so adding it to both pointers rewinds them to the
   last 64 bytes of the buffers.  %rsi is then not necessarily aligned,
   hence the explicit movdqu loads into %xmm4..%xmm7.  The register
   layout handed to L(64bytesormore_loop_end) is the same as in the
   loops.  */
.p2align 4,, 6
|
|
L(loop_tail):
|
|
addq %rdx, %rdi
|
|
movdqu (%rdi), %xmm0
|
|
movdqu 16(%rdi), %xmm1
|
|
movdqu 32(%rdi), %xmm2
|
|
movdqu 48(%rdi), %xmm3
|
|
|
|
addq %rdx, %rsi
|
|
movdqu (%rsi), %xmm4
|
|
movdqu 16(%rsi), %xmm5
|
|
movdqu 32(%rsi), %xmm6
|
|
movdqu 48(%rsi), %xmm7
|
|
|
|
CMPEQ %xmm4, %xmm0
|
|
CMPEQ %xmm5, %xmm1
|
|
CMPEQ %xmm6, %xmm2
|
|
CMPEQ %xmm7, %xmm3
|
|
|
|
pand %xmm0, %xmm1
|
|
pand %xmm2, %xmm3
|
|
pand %xmm1, %xmm3
|
|
|
|
pmovmskb %xmm3, %eax
|
|
incw %ax
|
|
jnz L(64bytesormore_loop_end)
|
|
/* Everything matched; %eax is 0 here.  */
ret
|
|
|
|
/* Large-length variant of the unaligned loop: same 64-byte step as
   L(64bytesormore_loop) (despite the "128bytes" in the label name), with
   a non-temporal prefetch 0x1c0 bytes ahead on both buffers.  */
L(L2_L3_cache_unaligned):
|
|
/* Bias the count by -64; see the loop exit condition below.  */
subq $64, %rdx
|
|
.p2align 4
|
|
L(L2_L3_unaligned_128bytes_loop):
|
|
prefetchnta 0x1c0(%rdi)
|
|
prefetchnta 0x1c0(%rsi)
|
|
|
|
movdqu (%rdi), %xmm0
|
|
movdqu 16(%rdi), %xmm1
|
|
movdqu 32(%rdi), %xmm2
|
|
movdqu 48(%rdi), %xmm3
|
|
|
|
CMPEQ (%rsi), %xmm0
|
|
CMPEQ 16(%rsi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm2
|
|
CMPEQ 48(%rsi), %xmm3
|
|
|
|
/* Combine the four results; %xmm0 and %xmm2 keep their individual
   results for L(64bytesormore_loop_end).  */
pand %xmm0, %xmm1
|
|
pand %xmm2, %xmm3
|
|
pand %xmm1, %xmm3
|
|
|
|
pmovmskb %xmm3, %eax
|
|
incw %ax
|
|
jnz L(64bytesormore_loop_end)
|
|
|
|
add $64, %rsi
|
|
add $64, %rdi
|
|
sub $64, %rdx
|
|
/* Continue while more than 64 bytes remain.  */
ja L(L2_L3_unaligned_128bytes_loop)
|
|
jmp L(loop_tail)
|
|
|
|
|
|
/* This case is for machines which are sensitive to unaligned
|
|
* instructions. */
|
|
/* Reached from L(79bytesormore) when, after aligning %rsi, %rdi is
   16-byte aligned as well.  Mirrors the unaligned paths but loads from
   %rdi with movdqa.  */
.p2align 4
|
|
L(2aligned):
|
|
cmp $128, %rdx
|
|
ja L(128bytesormorein2aligned)
|
/* 64..128 bytes remain: compare the first 64 bytes one vector at a
   time.  */
L(less128bytesin2aligned):
|
|
movdqa (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqa 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
movdqa 32(%rdi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
movdqa 48(%rdi), %xmm1
|
|
CMPEQ 48(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_48)
|
|
|
|
/* Fewer than 96 bytes in total: the last 32 bytes finish the job.  */
cmp $96, %rdx
|
|
jb L(32_to_64_bytes)
|
|
|
|
addq $64, %rdi
|
|
addq $64, %rsi
|
|
subq $64, %rdx
|
|
|
|
/* 32..64 bytes remain: first 32 bytes with aligned loads, last 32 bytes
   addressed from the end.  */
.p2align 4,, 6
|
|
L(aligned_last_64_bytes):
|
|
movdqa (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqa 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
/* End-relative addresses are not necessarily aligned: movdqu.  */
movdqu -32(%rdi, %rdx), %xmm0
|
|
movdqu -32(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end_16)
|
|
|
|
movdqu -16(%rdi, %rdx), %xmm0
|
|
movdqu -16(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end)
|
|
ret
|
|
|
|
/* Both pointers 16-byte aligned, more than 128 bytes remain.  Above 256
   bytes use the aligned loop; otherwise compare 128 bytes straight-line
   and dispatch on what is left.  */
.p2align 4
|
|
L(128bytesormorein2aligned):
|
|
cmp $256, %rdx
|
|
ja L(aligned_loop)
|
|
L(less256bytesin2alinged):
|
|
movdqa (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqa 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
movdqa 32(%rdi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
movdqa 48(%rdi), %xmm1
|
|
CMPEQ 48(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_48)
|
|
|
|
/* Second 64-byte group.  */
addq $64, %rdi
|
|
addq $64, %rsi
|
|
|
|
movdqa (%rdi), %xmm1
|
|
CMPEQ (%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin)
|
|
|
|
movdqa 16(%rdi), %xmm1
|
|
CMPEQ 16(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_16)
|
|
|
|
movdqa 32(%rdi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_32)
|
|
|
|
movdqa 48(%rdi), %xmm1
|
|
CMPEQ 48(%rsi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_begin_48)
|
|
|
|
/* 128 bytes are done: drop them from the length and step the pointers
   past the second group.  1..128 bytes remain.  */
addq $-128, %rdx
|
|
subq $-64, %rsi
|
|
subq $-64, %rdi
|
|
|
|
cmp $64, %rdx
|
|
ja L(less128bytesin2aligned)
|
|
|
|
cmp $32, %rdx
|
|
ja L(aligned_last_64_bytes)
|
|
|
|
/* At most 32 bytes remain: compare the last 32 bytes, addressed from
   the end (not necessarily aligned, so movdqu).  This may reach back
   into bytes that were already compared above.  */
movdqu -32(%rdi, %rdx), %xmm0
|
|
movdqu -32(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end_16)
|
|
|
|
movdqu -16(%rdi, %rdx), %xmm0
|
|
movdqu -16(%rsi, %rdx), %xmm1
|
|
CMPEQ %xmm0, %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
incw %ax
|
|
jnz L(vec_return_end)
|
|
ret
|
|
|
|
/* Both pointers 16-byte aligned, more than 256 bytes remain.
   %r8 = 3 * (half the data cache size); lengths above that threshold use
   the prefetching loop at L(L2_L3_cache_aligned).  */
.p2align 4
|
|
L(aligned_loop):
|
|
# ifdef DATA_CACHE_SIZE_HALF
|
|
mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
|
# else
|
|
mov __x86_data_cache_size_half(%rip), %R8_LP
|
|
# endif
|
|
movq %r8, %r9
|
|
addq %r8, %r8
|
|
addq %r9, %r8
|
|
cmpq %r8, %rdx
|
|
ja L(L2_L3_cache_aligned)
|
|
|
|
/* Bias the count by -64 so that the `sub $64; ja` at the bottom of the
   loop exits once at most 64 bytes are left.  */
sub $64, %rdx
|
|
.p2align 4
|
|
/* Main aligned loop: 64 bytes per iteration.  After the pands:
   %xmm0 = result for bytes 0..15, %xmm1 = results for 0..31 combined,
   %xmm2 = result for bytes 32..47, %xmm3 = all four combined.
   L(64bytesormore_loop_end) relies on this layout.  */
L(64bytesormore_loopin2aligned):
|
|
movdqa (%rdi), %xmm0
|
|
movdqa 16(%rdi), %xmm1
|
|
movdqa 32(%rdi), %xmm2
|
|
movdqa 48(%rdi), %xmm3
|
|
|
|
CMPEQ (%rsi), %xmm0
|
|
CMPEQ 16(%rsi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm2
|
|
CMPEQ 48(%rsi), %xmm3
|
|
|
|
pand %xmm0, %xmm1
|
|
pand %xmm2, %xmm3
|
|
pand %xmm1, %xmm3
|
|
|
|
pmovmskb %xmm3, %eax
|
|
incw %ax
|
|
jnz L(64bytesormore_loop_end)
|
|
add $64, %rsi
|
|
add $64, %rdi
|
|
sub $64, %rdx
|
|
ja L(64bytesormore_loopin2aligned)
|
|
/* At most 64 bytes left: finish in the shared tail.  */
jmp L(loop_tail)
|
|
|
|
/* Large-length variant of the aligned loop: same 64-byte step as
   L(64bytesormore_loopin2aligned) (despite the "128bytes" in the label
   name), with a non-temporal prefetch 0x1c0 bytes ahead on both
   buffers.  */
L(L2_L3_cache_aligned):
|
|
/* Bias the count by -64; see the loop exit condition below.  */
subq $64, %rdx
|
|
.p2align 4
|
|
L(L2_L3_aligned_128bytes_loop):
|
|
prefetchnta 0x1c0(%rdi)
|
|
prefetchnta 0x1c0(%rsi)
|
|
movdqa (%rdi), %xmm0
|
|
movdqa 16(%rdi), %xmm1
|
|
movdqa 32(%rdi), %xmm2
|
|
movdqa 48(%rdi), %xmm3
|
|
|
|
CMPEQ (%rsi), %xmm0
|
|
CMPEQ 16(%rsi), %xmm1
|
|
CMPEQ 32(%rsi), %xmm2
|
|
CMPEQ 48(%rsi), %xmm3
|
|
|
|
/* Combine the four results; %xmm0 and %xmm2 keep their individual
   results for L(64bytesormore_loop_end).  */
pand %xmm0, %xmm1
|
|
pand %xmm2, %xmm3
|
|
pand %xmm1, %xmm3
|
|
|
|
pmovmskb %xmm3, %eax
|
|
incw %ax
|
|
jnz L(64bytesormore_loop_end)
|
|
|
|
addq $64, %rsi
|
|
addq $64, %rdi
|
|
subq $64, %rdx
|
|
/* Continue while more than 64 bytes remain.  */
ja L(L2_L3_aligned_128bytes_loop)
|
|
jmp L(loop_tail)
|
|
|
|
/* A mismatch lies somewhere in the 64-byte block at %rdi / %rsi.
   On entry: %xmm0 = compare result for bytes 0..15, %xmm1 = results for
   bytes 0..31 ANDed, %xmm2 = result for bytes 32..47, and %eax = mask of
   all four results ANDed, after `incw %ax`.
   The code finds the first vector containing a mismatch and builds in
   %rcx a value whose lowest set bit is the byte offset of the first
   differing element, relative to the current %rdi / %rsi.  */
.p2align 4
|
|
L(64bytesormore_loop_end):
|
|
/* Vector 0: after incw the lowest set bit is the first mismatch
   (offset 0..15); zero means vector 0 matched completely.  */
pmovmskb %xmm0, %ecx
|
|
incw %cx
|
|
jnz L(loop_end_ret)
|
|
|
|
/* Vector 1: vector 0 matched, so %xmm1 is the plain result for bytes
   16..31.  Invert the mask and move it to bits 16..31 so that bsf
   yields an offset of 16..31.  */
pmovmskb %xmm1, %ecx
|
|
notw %cx
|
|
sall $16, %ecx
|
|
jnz L(loop_end_ret)
|
|
|
|
/* Vector 2: same idea with the mask moved to bits 32..47.  */
pmovmskb %xmm2, %ecx
|
|
notw %cx
|
|
shlq $32, %rcx
|
|
jnz L(loop_end_ret)
|
|
|
|
/* Vectors 0..2 matched, so the combined mask in %rax describes vector 3
   alone.  Advance the pointers by 48 and use it directly.  */
addq $48, %rdi
|
|
addq $48, %rsi
|
|
movq %rax, %rcx
|
|
|
|
.p2align 4,, 6
|
|
L(loop_end_ret):
|
|
/* %rcx = byte offset of the first differing element.  */
bsfq %rcx, %rcx
|
|
# ifdef USE_AS_WMEMCMP
|
|
/* Signed element comparison; result is 1 or -1.  */
movl (%rdi, %rcx), %eax
|
|
xorl %edx, %edx
|
|
cmpl (%rsi, %rcx), %eax
|
|
setg %dl
|
|
leal -1(%rdx, %rdx), %eax
|
|
# else
|
|
/* Difference of the zero-extended differing bytes.  */
movzbl (%rdi, %rcx), %eax
|
|
movzbl (%rsi, %rcx), %ecx
|
|
subl %ecx, %eax
|
|
# endif
|
|
ret
|
|
END (MEMCMP)
|
|
#endif
|