Mirror of git://sourceware.org/git/glibc.git (synced 2024-11-21 01:12:26 +08:00)
x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
Optimizations are:

    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of perf).
    3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of
    New Implementation / Old Implementation.

    strcat-avx2  -> 0.998
    strcpy-avx2  -> 0.937
    stpcpy-avx2  -> 0.971
    strncpy-avx2 -> 0.793
    stpncpy-avx2 -> 0.775
    strncat-avx2 -> 0.962

Code Size Changes:

    function     -> Bytes New / Bytes Old -> Ratio

    strcat-avx2  ->  685 / 1639 -> 0.418
    strcpy-avx2  ->  560 /  903 -> 0.620
    stpcpy-avx2  ->  592 /  939 -> 0.630
    strncpy-avx2 -> 1176 / 2390 -> 0.492
    stpncpy-avx2 -> 1268 / 2438 -> 0.520
    strncat-avx2 -> 1042 / 2563 -> 0.407

Notes:

    1. Because of the significant difference between the
       implementations they are split into three files:

           strcpy-avx2.S  -> strcpy, stpcpy, strcat
           strncpy-avx2.S -> strncpy
           strncat-avx2.S -> strncat

       I couldn't find a way to merge them without making the ifdefs
       incredibly difficult to follow.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
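Illustration (not part of the commit): a minimal C sketch of the "overlapping
stores" idea from optimization 1, using a hypothetical helper copy_8_to_16.
Two fixed-size loads and stores whose ranges may overlap cover any length in
[8, 16] without a branch per size class; the AVX2 routines apply the same
trick with 16- and 32-byte (XMM/YMM) chunks.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch only: copy exactly n bytes, 8 <= n <= 16, with two
       possibly-overlapping 8-byte loads/stores instead of a byte loop
       or a branch per size class.  */
    static void
    copy_8_to_16 (char *dst, const char *src, size_t n)
    {
      uint64_t head, tail;
      memcpy (&head, src, 8);          /* First 8 bytes.  */
      memcpy (&tail, src + n - 8, 8);  /* Last 8 bytes; may overlap head.  */
      memcpy (dst, &head, 8);
      memcpy (dst + n - 8, &tail, 8);  /* The two stores cover exactly n bytes.  */
    }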
parent f049f52dfe
commit 642933158e
@ -1,3 +1,3 @@
#define USE_AS_STPCPY
#define STRCPY __stpcpy_avx2_rtm
#include "strcpy-avx2-rtm.S"
#define STPCPY __stpcpy_avx2_rtm
#include "x86-avx-rtm-vecs.h"
#include "stpcpy-avx2.S"
@ -1,4 +1,3 @@
#define USE_AS_STPCPY
#define USE_AS_STRNCPY
#define STRCPY __stpncpy_avx2_rtm
#include "strcpy-avx2-rtm.S"
#define STPNCPY __stpncpy_avx2_rtm
#include "x86-avx-rtm-vecs.h"
#include "stpncpy-avx2.S"
@ -3,6 +3,5 @@
#endif

#define USE_AS_STPCPY
#define USE_AS_STRNCPY
#define STRCPY STPNCPY
#include "strcpy-avx2.S"
#define STRNCPY STPNCPY
#include "strncpy-avx2.S"
@ -1,12 +1,3 @@
#ifndef STRCAT
# define STRCAT __strcat_avx2_rtm
#endif

#define ZERO_UPPER_VEC_REGISTERS_RETURN \
  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

#define VZEROUPPER_RETURN jmp L(return_vzeroupper)

#define SECTION(p) p##.avx.rtm

#define STRCAT __strcat_avx2_rtm
#include "x86-avx-rtm-vecs.h"
#include "strcat-avx2.S"
@ -16,266 +16,10 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <isa-level.h>
|
||||
|
||||
#if ISA_SHOULD_BUILD (3)
|
||||
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
# ifndef STRCAT
|
||||
# define STRCAT __strcat_avx2
|
||||
# endif
|
||||
|
||||
# define USE_AS_STRCAT
|
||||
|
||||
/* Number of bytes in a vector register */
|
||||
# define VEC_SIZE 32
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRCAT)
|
||||
mov %rdi, %r9
|
||||
# ifdef USE_AS_STRNCAT
|
||||
mov %rdx, %r8
|
||||
# endif
|
||||
|
||||
xor %eax, %eax
|
||||
mov %edi, %ecx
|
||||
and $((VEC_SIZE * 4) - 1), %ecx
|
||||
vpxor %xmm6, %xmm6, %xmm6
|
||||
cmp $(VEC_SIZE * 3), %ecx
|
||||
ja L(fourth_vector_boundary)
|
||||
vpcmpeqb (%rdi), %ymm6, %ymm0
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_first_vector)
|
||||
mov %rdi, %rax
|
||||
and $-VEC_SIZE, %rax
|
||||
jmp L(align_vec_size_start)
|
||||
L(fourth_vector_boundary):
|
||||
mov %rdi, %rax
|
||||
and $-VEC_SIZE, %rax
|
||||
vpcmpeqb (%rax), %ymm6, %ymm0
|
||||
mov $-1, %r10d
|
||||
sub %rax, %rcx
|
||||
shl %cl, %r10d
|
||||
vpmovmskb %ymm0, %edx
|
||||
and %r10d, %edx
|
||||
jnz L(exit)
|
||||
|
||||
L(align_vec_size_start):
|
||||
vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_second_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
|
||||
vpmovmskb %ymm1, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_third_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
|
||||
vpmovmskb %ymm2, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fourth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
|
||||
vpmovmskb %ymm3, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fifth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
|
||||
add $(VEC_SIZE * 4), %rax
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_second_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
|
||||
vpmovmskb %ymm1, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_third_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
|
||||
vpmovmskb %ymm2, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fourth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
|
||||
vpmovmskb %ymm3, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fifth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
|
||||
add $(VEC_SIZE * 4), %rax
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_second_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
|
||||
vpmovmskb %ymm1, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_third_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
|
||||
vpmovmskb %ymm2, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fourth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
|
||||
vpmovmskb %ymm3, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fifth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
|
||||
add $(VEC_SIZE * 4), %rax
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_second_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
|
||||
vpmovmskb %ymm1, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_third_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
|
||||
vpmovmskb %ymm2, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fourth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
|
||||
vpmovmskb %ymm3, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fifth_vector)
|
||||
|
||||
test $((VEC_SIZE * 4) - 1), %rax
|
||||
jz L(align_four_vec_loop)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
|
||||
add $(VEC_SIZE * 5), %rax
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit)
|
||||
|
||||
test $((VEC_SIZE * 4) - 1), %rax
|
||||
jz L(align_four_vec_loop)
|
||||
|
||||
vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
|
||||
add $VEC_SIZE, %rax
|
||||
vpmovmskb %ymm1, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit)
|
||||
|
||||
test $((VEC_SIZE * 4) - 1), %rax
|
||||
jz L(align_four_vec_loop)
|
||||
|
||||
vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
|
||||
add $VEC_SIZE, %rax
|
||||
vpmovmskb %ymm2, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit)
|
||||
|
||||
test $((VEC_SIZE * 4) - 1), %rax
|
||||
jz L(align_four_vec_loop)
|
||||
|
||||
vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
|
||||
add $VEC_SIZE, %rax
|
||||
vpmovmskb %ymm3, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit)
|
||||
|
||||
add $VEC_SIZE, %rax
|
||||
|
||||
.p2align 4
|
||||
L(align_four_vec_loop):
|
||||
vmovaps (%rax), %ymm4
|
||||
vpminub VEC_SIZE(%rax), %ymm4, %ymm4
|
||||
vmovaps (VEC_SIZE * 2)(%rax), %ymm5
|
||||
vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
|
||||
add $(VEC_SIZE * 4), %rax
|
||||
vpminub %ymm4, %ymm5, %ymm5
|
||||
vpcmpeqb %ymm5, %ymm6, %ymm5
|
||||
vpmovmskb %ymm5, %edx
|
||||
test %edx, %edx
|
||||
jz L(align_four_vec_loop)
|
||||
|
||||
vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
|
||||
sub $(VEC_SIZE * 5), %rax
|
||||
vpmovmskb %ymm0, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_second_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
|
||||
vpmovmskb %ymm1, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_third_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
|
||||
vpmovmskb %ymm2, %edx
|
||||
test %edx, %edx
|
||||
jnz L(exit_null_on_fourth_vector)
|
||||
|
||||
vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
|
||||
vpmovmskb %ymm3, %edx
|
||||
sub %rdi, %rax
|
||||
bsf %rdx, %rdx
|
||||
add %rdx, %rax
|
||||
add $(VEC_SIZE * 4), %rax
|
||||
jmp L(StartStrcpyPart)
|
||||
|
||||
.p2align 4
|
||||
L(exit):
|
||||
sub %rdi, %rax
|
||||
L(exit_null_on_first_vector):
|
||||
bsf %rdx, %rdx
|
||||
add %rdx, %rax
|
||||
jmp L(StartStrcpyPart)
|
||||
|
||||
.p2align 4
|
||||
L(exit_null_on_second_vector):
|
||||
sub %rdi, %rax
|
||||
bsf %rdx, %rdx
|
||||
add %rdx, %rax
|
||||
add $VEC_SIZE, %rax
|
||||
jmp L(StartStrcpyPart)
|
||||
|
||||
.p2align 4
|
||||
L(exit_null_on_third_vector):
|
||||
sub %rdi, %rax
|
||||
bsf %rdx, %rdx
|
||||
add %rdx, %rax
|
||||
add $(VEC_SIZE * 2), %rax
|
||||
jmp L(StartStrcpyPart)
|
||||
|
||||
.p2align 4
|
||||
L(exit_null_on_fourth_vector):
|
||||
sub %rdi, %rax
|
||||
bsf %rdx, %rdx
|
||||
add %rdx, %rax
|
||||
add $(VEC_SIZE * 3), %rax
|
||||
jmp L(StartStrcpyPart)
|
||||
|
||||
.p2align 4
|
||||
L(exit_null_on_fifth_vector):
|
||||
sub %rdi, %rax
|
||||
bsf %rdx, %rdx
|
||||
add %rdx, %rax
|
||||
add $(VEC_SIZE * 4), %rax
|
||||
|
||||
.p2align 4
|
||||
L(StartStrcpyPart):
|
||||
lea (%r9, %rax), %rdi
|
||||
mov %rsi, %rcx
|
||||
mov %r9, %rax /* save result */
|
||||
|
||||
# ifdef USE_AS_STRNCAT
|
||||
test %r8, %r8
|
||||
jz L(ExitZero)
|
||||
# define USE_AS_STRNCPY
|
||||
# endif
|
||||
|
||||
# include "strcpy-avx2.S"
|
||||
#ifndef STRCAT
|
||||
# define STRCAT __strcat_avx2
|
||||
#endif
|
||||
|
||||
#define USE_AS_STRCAT
|
||||
#define STRCPY STRCAT
|
||||
#include "strcpy-avx2.S"
sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S (new file, 101 lines)
@ -0,0 +1,101 @@
|
||||
/* strlen used for beginning of str{n}cat using AVX2.
|
||||
Copyright (C) 2011-2022 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
|
||||
/* NOTE: This file is meant to be included by strcat-avx2 or
|
||||
strncat-avx2 and is not standalone.  Before including it, %rdi
|
||||
must be saved in %rax. */
|
||||
|
||||
|
||||
/* Simple strlen implementation that ends at
|
||||
L(strcat_strlen_done). */
|
||||
movq %rdi, %r8
|
||||
andq $(VEC_SIZE * -1), %r8
|
||||
VPCMPEQ (%r8), %VZERO, %VMM(0)
|
||||
vpmovmskb %VMM(0), %ecx
|
||||
shrxl %edi, %ecx, %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v0)
|
||||
|
||||
VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
|
||||
vpmovmskb %VMM(0), %ecx
|
||||
leaq (VEC_SIZE)(%r8), %rdi
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v0)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
|
||||
vpmovmskb %VMM(0), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v1)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
|
||||
vpmovmskb %VMM(0), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v2)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
|
||||
vpmovmskb %VMM(0), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v3)
|
||||
|
||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
.p2align 4,, 8
|
||||
L(loop_2x_vec):
|
||||
VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
|
||||
VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
|
||||
VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
|
||||
VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
|
||||
VPMIN %VMM(1), %VMM(3), %VMM(3)
|
||||
VPCMPEQ %VMM(3), %VZERO, %VMM(3)
|
||||
vpmovmskb %VMM(3), %r8d
|
||||
subq $(VEC_SIZE * -4), %rdi
|
||||
testl %r8d, %r8d
|
||||
jz L(loop_2x_vec)
|
||||
|
||||
addq $(VEC_SIZE * -4 + 1), %rdi
|
||||
|
||||
VPCMPEQ %VMM(0), %VZERO, %VMM(0)
|
||||
vpmovmskb %VMM(0), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v0)
|
||||
|
||||
VPCMPEQ %VMM(1), %VZERO, %VMM(1)
|
||||
vpmovmskb %VMM(1), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v1)
|
||||
|
||||
VPCMPEQ %VMM(2), %VZERO, %VMM(2)
|
||||
vpmovmskb %VMM(2), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(bsf_and_done_v2)
|
||||
|
||||
movl %r8d, %ecx
|
||||
L(bsf_and_done_v3):
|
||||
addq $VEC_SIZE, %rdi
|
||||
L(bsf_and_done_v2):
|
||||
bsfl %ecx, %ecx
|
||||
leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi
|
||||
jmp L(strcat_strlen_done)
|
||||
|
||||
.p2align 4,, 4
|
||||
L(bsf_and_done_v1):
|
||||
addq $VEC_SIZE, %rdi
|
||||
L(bsf_and_done_v0):
|
||||
bsfl %ecx, %ecx
|
||||
addq %rcx, %rdi
|
||||
L(strcat_strlen_done):
@ -1,12 +1,3 @@
#ifndef STRCPY
# define STRCPY __strcpy_avx2_rtm
#endif

#define ZERO_UPPER_VEC_REGISTERS_RETURN \
  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

#define VZEROUPPER_RETURN jmp L(return_vzeroupper)

#define SECTION(p) p##.avx.rtm

#define STRCPY __strcpy_avx2_rtm
#include "x86-avx-rtm-vecs.h"
#include "strcpy-avx2.S"
File diff suppressed because it is too large.
@ -1,3 +1,3 @@
#define USE_AS_STRNCAT
#define STRCAT __strncat_avx2_rtm
#include "strcat-avx2-rtm.S"
#define STRNCAT __strncat_avx2_rtm
#include "x86-avx-rtm-vecs.h"
#include "strncat-avx2.S"
@ -1,7 +1,419 @@
|
||||
#ifndef STRNCAT
|
||||
# define STRNCAT __strncat_avx2
|
||||
#endif
|
||||
/* strncat with AVX2
|
||||
Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
#define USE_AS_STRNCAT
|
||||
#define STRCAT STRNCAT
|
||||
#include "strcat-avx2.S"
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <isa-level.h>
|
||||
|
||||
#if ISA_SHOULD_BUILD (3)
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
# ifndef VEC_SIZE
|
||||
# include "x86-avx-vecs.h"
|
||||
# endif
|
||||
|
||||
# ifndef STRNCAT
|
||||
# define STRNCAT __strncat_avx2
|
||||
# endif
|
||||
|
||||
# ifdef USE_AS_WCSCPY
|
||||
# define MOVCHAR movl
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
# define VPMIN vpminud
|
||||
# define CHAR_SIZE 4
|
||||
# else
|
||||
# define MOVCHAR movb
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
# define VPMIN vpminub
|
||||
# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# include "strncpy-or-cat-overflow-def.h"
|
||||
|
||||
# define PAGE_SIZE 4096
|
||||
|
||||
# define VZERO VMM(7)
|
||||
# define VZERO_128 VMM_128(7)
|
||||
|
||||
.section SECTION(.text), "ax", @progbits
|
||||
ENTRY(STRNCAT)
|
||||
/* Filter zero length strings and very long strings. Zero
|
||||
length strings just return, very long strings are handled by
|
||||
using the non-length variant {wcs|str}cat. */
|
||||
movq %rdi, %rax
|
||||
# ifdef USE_AS_WCSCPY
|
||||
leaq -1(%rdx), %rcx
|
||||
shr $56, %rcx
|
||||
jnz L(zero_len)
|
||||
salq $2, %rdx
|
||||
# else
|
||||
test %rdx, %rdx
|
||||
jl L(zero_len)
|
||||
# endif
|
||||
vpxor %VZERO_128, %VZERO_128, %VZERO_128
|
||||
|
||||
# include "strcat-strlen-avx2.h.S"
|
||||
|
||||
movl %esi, %ecx
|
||||
andl $(PAGE_SIZE - 1), %ecx
|
||||
cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
||||
ja L(page_cross)
|
||||
L(page_cross_continue):
|
||||
VMOVU (%rsi), %VMM(0)
|
||||
VPCMPEQ %VMM(0), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
|
||||
tzcnt %ecx, %r8d
|
||||
cmpq %r8, %rdx
|
||||
jbe L(less_1x_vec)
|
||||
|
||||
testl %ecx, %ecx
|
||||
jz L(more_1x_vec)
|
||||
|
||||
/* Hoist this to save code size. */
|
||||
|
||||
movl %r8d, %edx
|
||||
|
||||
L(less_1x_vec):
|
||||
COND_VZEROUPPER
|
||||
|
||||
cmpl $16, %edx
|
||||
jae L(copy_16_31)
|
||||
cmpl $8, %edx
|
||||
jae L(copy_8_15)
|
||||
|
||||
|
||||
# ifdef USE_AS_WCSCPY
|
||||
vmovd %VMM_128(0), (%rdi)
|
||||
MOVCHAR $0, (%rdi, %rdx)
|
||||
ret
|
||||
# else
|
||||
cmpl $4, %edx
|
||||
jae L(copy_4_7)
|
||||
|
||||
movzbl (%rsi), %ecx
|
||||
cmpl $1, %edx
|
||||
jbe L(set_null_term)
|
||||
|
||||
/* NB: make this `vmovw` if support for AVX512-FP16 is added.
|
||||
*/
|
||||
movzwl 1(%rsi), %esi
|
||||
movw %si, 1(%rdi)
|
||||
|
||||
.p2align 4,, 1
|
||||
L(set_null_term):
|
||||
movb %cl, (%rdi)
|
||||
MOVCHAR $0, (%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4,, 11
|
||||
L(copy_4_7):
|
||||
movl -(4)(%rsi, %rdx), %ecx
|
||||
vmovd %xmm0, (%rdi)
|
||||
movl %ecx, -(4)(%rdi, %rdx)
|
||||
MOVCHAR $0, (%rdi, %rdx)
|
||||
ret
|
||||
# endif
|
||||
|
||||
|
||||
.p2align 4,, 10
|
||||
L(copy_16_31):
|
||||
VMOVU -(16)(%rsi, %rdx), %xmm1
|
||||
VMOVU %xmm0, (%rdi)
|
||||
VMOVU %xmm1, -(16)(%rdi, %rdx)
|
||||
MOVCHAR $0, (%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4,, 10
|
||||
L(copy_8_15):
|
||||
movq -(8)(%rsi, %rdx), %rcx
|
||||
vmovq %xmm0, (%rdi)
|
||||
movq %rcx, -(8)(%rdi, %rdx)
|
||||
MOVCHAR $0, (%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4,, 8
|
||||
.p2align 6,, 14
|
||||
L(more_1x_vec):
|
||||
VMOVU %VMM(0), (%rdi)
|
||||
|
||||
/* Align rsi (src) and adjust rdx/rdi (length/dst). */
|
||||
addq %rsi, %rdx
|
||||
subq %rsi, %rdi
|
||||
orq $(VEC_SIZE - 1), %rsi
|
||||
incq %rsi
|
||||
addq %rsi, %rdi
|
||||
L(loop_last_4x_vec):
|
||||
subq %rsi, %rdx
|
||||
VMOVA 0(%rsi), %VMM(1)
|
||||
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
ja L(more_2x_vec)
|
||||
L(last_2x_vec):
|
||||
tzcnt %ecx, %ecx
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_vec_x1_len)
|
||||
|
||||
cmpl $VEC_SIZE, %ecx
|
||||
jnz L(ret_vec_x1)
|
||||
|
||||
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
|
||||
VMOVU %VMM(1), (%rdi)
|
||||
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
addl $-VEC_SIZE, %edx
|
||||
bzhil %edx, %ecx, %r8d
|
||||
jz L(ret_vec_x2_len)
|
||||
L(ret_vec_x2):
|
||||
bsfl %ecx, %edx
|
||||
L(ret_vec_x2_len):
|
||||
VMOVU (%rsi, %rdx), %VMM(0)
|
||||
MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx)
|
||||
VMOVU %VMM(0), (%rdi, %rdx)
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
|
||||
.p2align 4,, 12
|
||||
L(ret_vec_x1_len):
|
||||
movl %edx, %ecx
|
||||
L(ret_vec_x1):
|
||||
VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
|
||||
MOVCHAR $0, (%rdi, %rcx)
|
||||
VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4,, 8
|
||||
L(last_4x_vec):
|
||||
subq $-(VEC_SIZE * 4), %rsi
|
||||
VMOVA 0(%rsi), %VMM(1)
|
||||
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
addl $-(VEC_SIZE * 4), %edx
|
||||
cmpl $(VEC_SIZE * 2), %edx
|
||||
jbe L(last_2x_vec)
|
||||
.p2align 4,, 8
|
||||
L(more_2x_vec):
|
||||
/* L(ret_vec_x1) expects ecx to have position of first match so
|
||||
test with bsf. */
|
||||
bsfl %ecx, %ecx
|
||||
jnz L(ret_vec_x1)
|
||||
|
||||
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
|
||||
VMOVU %VMM(1), (%rdi)
|
||||
|
||||
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(ret_vec_x2)
|
||||
|
||||
|
||||
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
|
||||
VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi)
|
||||
|
||||
VPCMPEQ %VMM(3), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
|
||||
/* Check if length is greater than 4x VEC. */
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(more_4x_vec)
|
||||
|
||||
addl $(VEC_SIZE * -2), %edx
|
||||
|
||||
tzcnt %ecx, %ecx
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_vec_x3_len)
|
||||
|
||||
cmpl $VEC_SIZE, %ecx
|
||||
jnz L(ret_vec_x3)
|
||||
|
||||
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
|
||||
VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
|
||||
VPCMPEQ %VMM(4), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
addl $-VEC_SIZE, %edx
|
||||
bzhil %edx, %ecx, %r8d
|
||||
jz L(ret_vec_x4_len)
|
||||
L(ret_vec_x4):
|
||||
bsfl %ecx, %edx
|
||||
L(ret_vec_x4_len):
|
||||
VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
|
||||
MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx)
|
||||
VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4,, 4
|
||||
L(ret_vec_x3_len):
|
||||
movl %edx, %ecx
|
||||
L(ret_vec_x3):
|
||||
VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0)
|
||||
MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx)
|
||||
VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
|
||||
.p2align 4,, 8
|
||||
L(more_4x_vec):
|
||||
bsfl %ecx, %ecx
|
||||
jnz L(ret_vec_x3)
|
||||
|
||||
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
|
||||
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
|
||||
VPCMPEQ %VMM(4), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(ret_vec_x4)
|
||||
|
||||
VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
|
||||
|
||||
|
||||
/* Recheck length before aligning. */
|
||||
cmpq $(VEC_SIZE * 8), %rdx
|
||||
jbe L(last_4x_vec)
|
||||
|
||||
/* Align rsi (src) and adjust rdx/rdi (length/dst). */
|
||||
addq %rsi, %rdx
|
||||
subq %rsi, %rdi
|
||||
subq $-(VEC_SIZE * 4), %rsi
|
||||
andq $(VEC_SIZE * -4), %rsi
|
||||
|
||||
/* Do first half of loop ahead of time so loop can just start by
|
||||
storing. */
|
||||
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
|
||||
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
|
||||
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
|
||||
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
|
||||
|
||||
VPMIN %VMM(0), %VMM(1), %VMM(4)
|
||||
VPMIN %VMM(2), %VMM(3), %VMM(6)
|
||||
VPMIN %VMM(4), %VMM(6), %VMM(6)
|
||||
VPCMPEQ %VMM(6), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %r8d
|
||||
addq %rsi, %rdi
|
||||
testl %r8d, %r8d
|
||||
jnz L(loop_4x_done)
|
||||
|
||||
/* Use r9 for end of region before handling last 4x VEC
|
||||
specially. */
|
||||
leaq -(VEC_SIZE * 4)(%rdx), %r9
|
||||
|
||||
.p2align 4,, 11
|
||||
L(loop_4x_vec):
|
||||
|
||||
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
|
||||
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
|
||||
subq $(VEC_SIZE * -4), %rsi
|
||||
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
|
||||
VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
|
||||
|
||||
subq $(VEC_SIZE * -4), %rdi
|
||||
cmpq %rsi, %r9
|
||||
jbe L(loop_last_4x_vec)
|
||||
|
||||
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
|
||||
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
|
||||
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
|
||||
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
|
||||
|
||||
VPMIN %VMM(0), %VMM(1), %VMM(4)
|
||||
VPMIN %VMM(2), %VMM(3), %VMM(6)
|
||||
VPMIN %VMM(4), %VMM(6), %VMM(6)
|
||||
VPCMPEQ %VMM(6), %VZERO, %VMM(6)
|
||||
|
||||
vpmovmskb %VMM(6), %r8d
|
||||
|
||||
testl %r8d, %r8d
|
||||
jz L(loop_4x_vec)
|
||||
|
||||
L(loop_4x_done):
|
||||
VPCMPEQ %VMM(0), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
/* L(ret_vec_x1) expects ecx to have position of first match so
|
||||
test with bsf. */
|
||||
bsfl %ecx, %ecx
|
||||
jnz L(ret_vec_x1)
|
||||
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
|
||||
|
||||
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
|
||||
testl %ecx, %ecx
|
||||
jnz L(ret_vec_x2)
|
||||
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
|
||||
|
||||
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
bsfl %ecx, %ecx
|
||||
jnz L(ret_vec_x3)
|
||||
|
||||
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
|
||||
bsfl %r8d, %r8d
|
||||
VMOVU (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
|
||||
VMOVU %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
|
||||
|
||||
.p2align 4,, 4
|
||||
L(page_cross):
|
||||
movq %rsi, %r8
|
||||
andq $(VEC_SIZE * -1), %r8
|
||||
|
||||
VPCMPEQ (%r8), %VZERO, %VMM(6)
|
||||
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
shrxl %esi, %ecx, %ecx
|
||||
|
||||
subl %esi, %r8d
|
||||
andl $(VEC_SIZE - 1), %r8d
|
||||
cmpq %r8, %rdx
|
||||
jb L(page_cross_small)
|
||||
|
||||
/* Optimizing more aggressively for space as this is very cold
|
||||
code. This saves 2x cache lines. */
|
||||
|
||||
/* This adds once to the later result which will get correct
|
||||
copy bounds. NB: this can never zero-out a non-zero RCX as
|
||||
to be in the page cross case rsi cannot be aligned and we
|
||||
already right-shift rcx by the misalignment. */
|
||||
shll $CHAR_SIZE, %ecx
|
||||
jz L(page_cross_continue)
|
||||
bsfl %ecx, %ecx
|
||||
rep movsb
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(page_cross_small):
|
||||
tzcntl %ecx, %ecx
|
||||
jz L(page_cross_setz)
|
||||
cmpl %edx, %ecx
|
||||
cmova %edx, %ecx
|
||||
rep movsb
|
||||
L(page_cross_setz):
|
||||
MOVCHAR $0, (%rdi)
|
||||
VZEROUPPER_RETURN
|
||||
L(zero_len):
|
||||
# ifdef USE_AS_WCSCPY
|
||||
test %rdx, %rdx
|
||||
# endif
|
||||
jnz OVERFLOW_STRCAT
|
||||
ret
|
||||
|
||||
|
||||
END(STRNCAT)
|
||||
#endif
@ -1,3 +1,3 @@
#define USE_AS_STRNCPY
#define STRCPY __strncpy_avx2_rtm
#include "strcpy-avx2-rtm.S"
#define STRNCPY __strncpy_avx2_rtm
#include "x86-avx-rtm-vecs.h"
#include "strncpy-avx2.S"
@ -1,7 +1,735 @@
|
||||
#ifndef STRNCPY
|
||||
# define STRNCPY __strncpy_avx2
|
||||
#endif
|
||||
/* strncpy with AVX2
|
||||
Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
#define USE_AS_STRNCPY
|
||||
#define STRCPY STRNCPY
|
||||
#include "strcpy-avx2.S"
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <isa-level.h>
|
||||
|
||||
#if ISA_SHOULD_BUILD (3)
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
|
||||
# ifndef VEC_SIZE
|
||||
# include "x86-avx-vecs.h"
|
||||
# endif
|
||||
|
||||
# ifndef STRNCPY
|
||||
# define STRNCPY __strncpy_avx2
|
||||
# endif
|
||||
|
||||
|
||||
# ifdef USE_AS_WCSCPY
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
# define VPMIN vpminud
|
||||
# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
# define VPMIN vpminub
|
||||
# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# include "strncpy-or-cat-overflow-def.h"
|
||||
|
||||
# define PAGE_SIZE 4096
|
||||
|
||||
# define VZERO VMM(7)
|
||||
# define VZERO_128 VMM_128(7)
|
||||
|
||||
|
||||
.section SECTION(.text), "ax", @progbits
|
||||
ENTRY(STRNCPY)
|
||||
/* Filter zero length strings and very long strings. Zero
|
||||
length strings just return, very long strings are handled by
|
||||
just running rep stos{b|l} to zero set (which will almost
|
||||
certainly segfault); if that succeeds, then just calling
|
||||
OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
|
||||
# ifdef USE_AS_WCSCPY
|
||||
decq %rdx
|
||||
movq %rdx, %rax
|
||||
/* 56 is end of max supported address space. */
|
||||
shr $56, %rax
|
||||
jnz L(zero_len)
|
||||
salq $2, %rdx
|
||||
# else
|
||||
decq %rdx
|
||||
/* `dec` can macrofuse with `jl`. If the flag needs to become
|
||||
`jb` replace `dec` with `sub`. */
|
||||
jl L(zero_len)
|
||||
# endif
|
||||
|
||||
vpxor %VZERO_128, %VZERO_128, %VZERO_128
|
||||
movl %esi, %eax
|
||||
andl $(PAGE_SIZE - 1), %eax
|
||||
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
ja L(page_cross)
|
||||
|
||||
L(page_cross_continue):
|
||||
VMOVU (%rsi), %VMM(0)
|
||||
VPCMPEQ %VMM(0), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
|
||||
/* If no STPCPY just save end ahead of time. */
|
||||
# ifndef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# elif defined USE_AS_WCSCPY
|
||||
/* Clear dependency as nearly all return code for wcpncpy uses
|
||||
`setc %al`. */
|
||||
xorl %eax, %eax
|
||||
# endif
|
||||
|
||||
cmpq $(VEC_SIZE - CHAR_SIZE), %rdx
|
||||
/* `jb` because length rdx is now length - CHAR_SIZE. */
|
||||
jbe L(less_1x_vec)
|
||||
|
||||
/* This may overset but that's fine because we still need to zero
|
||||
fill. */
|
||||
VMOVU %VMM(0), (%rdi)
|
||||
|
||||
testl %ecx, %ecx
|
||||
jnz L(zfill)
|
||||
|
||||
/* Align. */
|
||||
addq %rsi, %rdx
|
||||
subq %rsi, %rdi
|
||||
orq $(VEC_SIZE - 1), %rsi
|
||||
incq %rsi
|
||||
L(last_4x_vec):
|
||||
addq %rsi, %rdi
|
||||
L(loop_last_4x_vec):
|
||||
subq %rsi, %rdx
|
||||
|
||||
|
||||
VMOVA 0(%rsi), %VMM(1)
|
||||
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
jae L(more_2x_vec)
|
||||
|
||||
cmpl $(VEC_SIZE), %edx
|
||||
jb L(ret_vec_x1_len)
|
||||
|
||||
testl %ecx, %ecx
|
||||
jnz L(ret_vec_x1)
|
||||
|
||||
VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
|
||||
VMOVU %VMM(1), (%rdi)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
shlq $VEC_SIZE, %rcx
|
||||
L(ret_vec_x1_len):
|
||||
tzcntq %rcx, %rcx
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_vec_x1_len_no_zfill)
|
||||
/* Fall through (expectation) is copy len < buffer len. */
|
||||
VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
L(ret_vec_x1_len_no_zfill_mov):
|
||||
movl %ecx, %edx
|
||||
# ifdef USE_AS_STPCPY
|
||||
/* clear flags. */
|
||||
xorl %ecx, %ecx
|
||||
# endif
|
||||
L(ret_vec_x1_len_no_zfill):
|
||||
VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
|
||||
VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
# ifdef USE_AS_STPCPY
|
||||
# ifdef USE_AS_WCSCPY
|
||||
setc %al
|
||||
addq %rdx, %rdi
|
||||
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
movl %edx, %eax
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
# endif
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4,, 6
|
||||
L(ret_vec_x1):
|
||||
bsfl %ecx, %ecx
|
||||
VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
subl %ecx, %edx
|
||||
/* Check if we need to reload/store. */
|
||||
cmpl $VEC_SIZE, %edx
|
||||
jb L(ret_vec_x1_len_no_zfill_mov)
|
||||
/* Otherwise safe to just store directly. */
|
||||
VMOVU %VMM(1), (%rdi)
|
||||
VMOVU %VZERO, (%rdi, %rcx)
|
||||
# ifdef USE_AS_STPCPY
|
||||
leaq (%rdi, %rcx), %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4,, 12
|
||||
L(more_2x_vec):
|
||||
VMOVU %VMM(1), (%rdi)
|
||||
testl %ecx, %ecx
|
||||
/* Must fill at least 2x VEC. */
|
||||
jnz L(zfill_vec1)
|
||||
|
||||
VMOVA VEC_SIZE(%rsi), %VMM(2)
|
||||
VMOVU %VMM(2), VEC_SIZE(%rdi)
|
||||
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
/* Must fill at least 1x VEC. */
|
||||
jnz L(zfill_vec2)
|
||||
|
||||
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
|
||||
VPCMPEQ %VMM(3), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
|
||||
/* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is len -
|
||||
CHAR_SIZE. */
|
||||
cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
|
||||
ja L(more_4x_vec)
|
||||
|
||||
subl $(VEC_SIZE * 3), %edx
|
||||
jb L(ret_vec_x3_len)
|
||||
|
||||
testl %ecx, %ecx
|
||||
jnz L(ret_vec_x3)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
|
||||
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
tzcntl %ecx, %ecx
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_vec_x4_len_no_zfill)
|
||||
/* Fall through (expectation) is copy len < buffer len. */
|
||||
VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
movl %ecx, %edx
|
||||
L(ret_vec_x4_len_no_zfill):
|
||||
VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
|
||||
VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
# ifdef USE_AS_STPCPY
|
||||
# ifdef USE_AS_WCSCPY
|
||||
setc %al
|
||||
addq %rdx, %rdi
|
||||
leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
leal (VEC_SIZE * 3 + 0)(%edx), %eax
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
|
||||
L(ret_vec_x3_len):
|
||||
addl $(VEC_SIZE * 1), %edx
|
||||
tzcntl %ecx, %ecx
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_vec_x3_len_no_zfill)
|
||||
/* Fall through (expectation) is copy len < buffer len. */
|
||||
VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
L(ret_vec_x3_len_no_zfill_mov):
|
||||
movl %ecx, %edx
|
||||
# ifdef USE_AS_STPCPY
|
||||
/* clear flags. */
|
||||
xorl %ecx, %ecx
|
||||
# endif
|
||||
.p2align 4,, 4
|
||||
L(ret_vec_x3_len_no_zfill):
|
||||
VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
|
||||
VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
|
||||
# ifdef USE_AS_STPCPY
|
||||
# ifdef USE_AS_WCSCPY
|
||||
setc %al
|
||||
addq %rdx, %rdi
|
||||
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
leal (VEC_SIZE * 2 + 0)(%rdx), %eax
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
|
||||
.p2align 4,, 8
|
||||
L(ret_vec_x3):
|
||||
bsfl %ecx, %ecx
|
||||
VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
|
||||
subl %ecx, %edx
|
||||
jl L(ret_vec_x3_len_no_zfill_mov)
|
||||
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
|
||||
# ifdef USE_AS_STPCPY
|
||||
leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4,, 8
|
||||
L(more_4x_vec):
|
||||
|
||||
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
|
||||
testl %ecx, %ecx
|
||||
jnz L(zfill_vec3)
|
||||
|
||||
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
|
||||
VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
|
||||
VPCMPEQ %VMM(4), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(zfill_vec4)
|
||||
|
||||
movq %rdx, %rcx
|
||||
addq %rsi, %rdx
|
||||
subq %rsi, %rdi
|
||||
subq $-(VEC_SIZE * 4), %rsi
|
||||
/* Recheck length before aligning. */
|
||||
cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
|
||||
jbe L(last_4x_vec)
|
||||
|
||||
andq $(VEC_SIZE * -4), %rsi
|
||||
|
||||
/* Do first half of loop ahead of time so loop can just start by
|
||||
storing. */
|
||||
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
|
||||
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
|
||||
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
|
||||
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
|
||||
|
||||
VPMIN %VMM(0), %VMM(1), %VMM(4)
|
||||
VPMIN %VMM(2), %VMM(3), %VMM(6)
|
||||
VPMIN %VMM(4), %VMM(6), %VMM(6)
|
||||
VPCMPEQ %VMM(6), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %r8d
|
||||
addq %rsi, %rdi
|
||||
testl %r8d, %r8d
|
||||
jnz L(loop_4x_done)
|
||||
|
||||
/* Use r9 as end register. */
|
||||
leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
|
||||
|
||||
.p2align 4,, 11
|
||||
L(loop_4x_vec):
|
||||
|
||||
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
|
||||
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
|
||||
subq $(VEC_SIZE * -4), %rsi
|
||||
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
|
||||
VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
|
||||
|
||||
subq $(VEC_SIZE * -4), %rdi
|
||||
cmpq %rsi, %r9
|
||||
jbe L(loop_last_4x_vec)
|
||||
|
||||
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
|
||||
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
|
||||
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
|
||||
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
|
||||
|
||||
VPMIN %VMM(0), %VMM(1), %VMM(4)
|
||||
VPMIN %VMM(2), %VMM(3), %VMM(6)
|
||||
VPMIN %VMM(4), %VMM(6), %VMM(6)
|
||||
VPCMPEQ %VMM(6), %VZERO, %VMM(6)
|
||||
|
||||
vpmovmskb %VMM(6), %r8d
|
||||
|
||||
testl %r8d, %r8d
|
||||
jz L(loop_4x_vec)
|
||||
|
||||
L(loop_4x_done):
|
||||
subq %rsi, %rdx
|
||||
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
|
||||
VPCMPEQ %VMM(0), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(zfill_vec1)
|
||||
|
||||
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
|
||||
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(zfill_vec2)
|
||||
|
||||
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
|
||||
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(zfill_vec3)
|
||||
|
||||
VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
|
||||
movl %r8d, %ecx
|
||||
|
||||
// Zfill more....
|
||||
|
||||
.p2align 4,, 4
|
||||
L(zfill_vec4):
|
||||
addq $(VEC_SIZE * 2), %rdi
|
||||
subq $(VEC_SIZE * 2), %rdx
|
||||
L(zfill_vec2):
|
||||
shlq $VEC_SIZE, %rcx
|
||||
L(zfill):
|
||||
bsfq %rcx, %rcx
|
||||
subq %rcx, %rdx
|
||||
addq %rcx, %rdi
|
||||
# ifdef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
L(zfill_from_page_cross):
|
||||
cmpq $VEC_SIZE, %rdx
|
||||
jb L(zfill_less_vec_vzeroupper)
|
||||
|
||||
L(zfill_more_1x_vec):
|
||||
VMOVU %VZERO, CHAR_SIZE(%rdi)
|
||||
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
jae L(zfill_more_2x_vec)
|
||||
L(zfill_done0):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4,, 8
|
||||
L(zfill_vec3):
|
||||
addq $(VEC_SIZE * 2), %rdi
|
||||
subq $(VEC_SIZE * 2), %rdx
|
||||
.p2align 4,, 2
|
||||
L(zfill_vec1):
|
||||
bsfl %ecx, %ecx
|
||||
addq %rcx, %rdi
|
||||
subq %rcx, %rdx
|
||||
# ifdef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
/* zfill from vec1/vec3 always has to set at least 2x VECs. */
|
||||
|
||||
VMOVU %VZERO, CHAR_SIZE(%rdi)
|
||||
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
jb L(zfill_done0)
|
||||
L(zfill_more_2x_vec):
|
||||
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
|
||||
VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
|
||||
subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
|
||||
jbe L(zfill_done)
|
||||
|
||||
addq %rdi, %rdx
|
||||
VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
|
||||
VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
|
||||
|
||||
|
||||
VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
|
||||
VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
|
||||
|
||||
subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
|
||||
cmpq %rdi, %rdx
|
||||
jbe L(zfill_done)
|
||||
|
||||
andq $-(VEC_SIZE), %rdi
|
||||
.p2align 4,, 12
|
||||
L(zfill_loop_4x_vec):
|
||||
VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
|
||||
VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
|
||||
VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
|
||||
VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
cmpq %rdi, %rdx
|
||||
ja L(zfill_loop_4x_vec)
|
||||
L(zfill_done):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
|
||||
.p2align 4,, 8
|
||||
L(copy_1x):
|
||||
VMOVU %VMM(0), (%rdi)
|
||||
testl %ecx, %ecx
|
||||
jz L(ret_32_32)
|
||||
L(zfill_less_vec):
|
||||
bsfl %ecx, %ecx
|
||||
L(zfill_less_vec_no_bsf):
|
||||
subq %rcx, %rdx
|
||||
addq %rcx, %rdi
|
||||
# ifdef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
L(zfill_less_vec_vzeroupper):
|
||||
COND_VZEROUPPER
|
||||
/* We are taking advantage of the fact that to be here we must
|
||||
be writing null-term as (%rdi, %rcx) we have a byte of lee-
|
||||
way for overwriting. */
|
||||
cmpl $16, %edx
|
||||
jb L(zfill_less_16)
|
||||
VMOVU %VZERO_128, (%rdi)
|
||||
VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
|
||||
ret
|
||||
# ifdef USE_AS_STPCPY
|
||||
L(ret_32_32):
|
||||
leaq CHAR_SIZE(%rdi, %rdx), %rax
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
|
||||
.p2align 4,, 4
|
||||
L(copy_16_31):
|
||||
/* Overfill to avoid branches. */
|
||||
vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
|
||||
cmpl %ecx, %edx
|
||||
ja L(zfill_less_vec_no_bsf)
|
||||
# ifndef USE_AS_STPCPY
|
||||
L(ret_32_32):
|
||||
# else
|
||||
# ifdef USE_AS_WCSCPY
|
||||
setc %al
|
||||
addq %rdx, %rdi
|
||||
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
movl %edx, %eax
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4,, 4
|
||||
L(copy_8_15):
|
||||
/* Overfill to avoid branches. */
|
||||
movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
|
||||
vmovq %xmm0, (%rdi)
|
||||
movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_8_15)
|
||||
subq %rcx, %rdx
|
||||
addq %rcx, %rdi
|
||||
# ifdef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
.p2align 4,, 8
|
||||
L(zfill_less_16):
|
||||
xorl %ecx, %ecx
|
||||
cmpl $8, %edx
|
||||
jb L(zfill_less_8)
|
||||
movq %rcx, (%rdi)
|
||||
movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
|
||||
# ifndef USE_AS_STPCPY
|
||||
L(ret_8_15):
|
||||
# endif
|
||||
ret
|
||||
|
||||
|
||||
.p2align 4,, 8
|
||||
L(less_1x_vec):
|
||||
/* Reuse flag from `cmp $VEC_SIZE, %rdx`. The idea is many
|
||||
buffer sizes are aligned conventionally. */
|
||||
je L(copy_1x)
|
||||
|
||||
tzcntl %ecx, %ecx
|
||||
cmpl $16, %edx
|
||||
jae L(copy_16_31)
|
||||
|
||||
COND_VZEROUPPER
|
||||
cmpl $8, %edx
|
||||
jae L(copy_8_15)
|
||||
# ifdef USE_AS_WCSCPY
|
||||
testl %ecx, %ecx
|
||||
jz L(zfill_less_8_set_ret)
|
||||
|
||||
movl (%rsi, %rdx), %esi
|
||||
vmovd %xmm0, (%rdi)
|
||||
movl %esi, (%rdi, %rdx)
|
||||
|
||||
# ifdef USE_AS_STPCPY
|
||||
cmpl %ecx, %edx
|
||||
L(ret_8_15):
|
||||
setc %al
|
||||
addq %rdx, %rdi
|
||||
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
ret
|
||||
L(zfill_less_8_set_ret):
|
||||
xorl %ecx, %ecx
|
||||
# ifdef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
L(zfill_less_8):
|
||||
movl %ecx, (%rdi)
|
||||
movl %ecx, (%rdi, %rdx)
|
||||
ret
|
||||
|
||||
# else
|
||||
cmpl $3, %edx
|
||||
jb L(copy_0_3)
|
||||
/* Overfill to avoid branches. */
|
||||
movl -3(%rsi, %rdx), %esi
|
||||
vmovd %xmm0, (%rdi)
|
||||
movl %esi, -3(%rdi, %rdx)
|
||||
cmpl %ecx, %edx
|
||||
jbe L(ret_4_7)
|
||||
subq %rcx, %rdx
|
||||
addq %rcx, %rdi
|
||||
# ifdef USE_AS_STPCPY
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
xorl %ecx, %ecx
|
||||
.p2align 4,, 8
|
||||
L(zfill_less_8):
|
||||
cmpl $3, %edx
|
||||
jb L(zfill_less_3)
|
||||
movl %ecx, (%rdi)
|
||||
movl %ecx, -3(%rdi, %rdx)
|
||||
# ifdef USE_AS_STPCPY
|
||||
ret
|
||||
# endif
|
||||
|
||||
L(ret_4_7):
|
||||
# ifdef USE_AS_STPCPY
|
||||
L(ret_8_15):
|
||||
movl %edx, %eax
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
ret
|
||||
|
||||
.p2align 4,, 4
|
||||
L(zfill_less_3):
|
||||
testl %edx, %edx
|
||||
jz L(zfill_1)
|
||||
movw %cx, (%rdi)
|
||||
L(zfill_1):
|
||||
movb %cl, (%rdi, %rdx)
|
||||
ret
|
||||
|
||||
.p2align 4,, 8
|
||||
L(copy_0_3):
|
||||
vmovd %xmm0, %r8d
|
||||
testl %edx, %edx
|
||||
jz L(copy_1)
|
||||
movw %r8w, (%rdi)
|
||||
cmpl %ecx, %edx
|
||||
ja L(zfill_from_1)
|
||||
movzbl (%rsi, %rdx), %r8d
|
||||
# ifdef USE_AS_STPCPY
|
||||
movl %edx, %eax
|
||||
adcq %rdi, %rax
|
||||
movb %r8b, (%rdi, %rdx)
|
||||
ret
|
||||
# endif
|
||||
|
||||
L(copy_1):
|
||||
# ifdef USE_AS_STPCPY
|
||||
movl %edx, %eax
|
||||
cmpl %ecx, %edx
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSCPY
|
||||
vmovd %xmm0, (%rdi)
|
||||
# else
|
||||
movb %r8b, (%rdi, %rdx)
|
||||
# endif
|
||||
ret
|
||||
# endif
|
||||
|
||||
.p2align 4,, 2
|
||||
L(zero_len):
|
||||
movq %rdi, %rax
|
||||
ret
|
||||
# ifndef USE_AS_WCSCPY
|
||||
.p2align 4,, 8
|
||||
L(zfill_from_1):
|
||||
# ifdef USE_AS_STPCPY
|
||||
leaq (%rdi, %rcx), %rax
|
||||
# endif
|
||||
movw $0, -1(%rdi, %rdx)
|
||||
ret
|
||||
# endif
|
||||
|
||||
.p2align 4,, 4
|
||||
.p2align 6,, 8
|
||||
L(page_cross):
|
||||
movq %rsi, %rax
|
||||
andq $(VEC_SIZE * -1), %rax
|
||||
|
||||
VPCMPEQ (%rax), %VZERO, %VMM(6)
|
||||
|
||||
vpmovmskb %VMM(6), %ecx
|
||||
shrxl %esi, %ecx, %ecx
|
||||
|
||||
subl %esi, %eax
|
||||
andl $(VEC_SIZE - 1), %eax
|
||||
cmpq %rax, %rdx
|
||||
jb L(page_cross_small)
|
||||
/* Optimizing more aggressively for space as this is very cold
|
||||
code. This saves 2x cache lines. */
|
||||
|
||||
/* If rcx is non-zero then continue. */
|
||||
shl $CHAR_SIZE, %ecx
|
||||
jz L(page_cross_continue)
|
||||
bsf %ecx, %ecx
|
||||
|
||||
subq %rcx, %rdx
|
||||
# ifdef USE_AS_STPCPY
|
||||
leaq -CHAR_SIZE(%rdi, %rcx), %rax
|
||||
# else
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
|
||||
rep movsb
|
||||
# ifdef USE_AS_WCSCPY
|
||||
movl $0, (%rdi)
|
||||
# else
|
||||
movb $0, (%rdi)
|
||||
# endif
|
||||
jmp L(zfill_from_page_cross)
|
||||
|
||||
L(page_cross_small):
|
||||
tzcntl %ecx, %ecx
|
||||
xorl %eax, %eax
|
||||
cmpl %ecx, %edx
|
||||
jbe L(page_cross_copy_only)
|
||||
|
||||
/* Do a zfill of the tail before copying. */
|
||||
movq %rdi, %r9
|
||||
movl %ecx, %r8d
|
||||
|
||||
subl %ecx, %edx
|
||||
leaq CHAR_SIZE(%rdi, %rcx), %rdi
|
||||
movl %edx, %ecx
|
||||
rep stosb
|
||||
movq %r9, %rdi
|
||||
movl %r8d, %edx
|
||||
L(page_cross_copy_only):
|
||||
leal CHAR_SIZE(%rdx), %ecx
|
||||
# ifdef USE_AS_STPCPY
|
||||
# ifdef USE_AS_WCSCPY
|
||||
setc %al
|
||||
addq %rdi, %rdx
|
||||
leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
movl %edx, %eax
|
||||
adcq %rdi, %rax
|
||||
# endif
|
||||
# else
|
||||
movq %rdi, %rax
|
||||
# endif
|
||||
rep movsb
|
||||
ret
|
||||
|
||||
|
||||
L(best_effort_strncpy):
|
||||
movq %rdx, %rcx
|
||||
xorl %eax, %eax
|
||||
movq %rdi, %r8
|
||||
/* The length is >= 2^63. We very much expect to segfault at
|
||||
rep stos. If that doesn't happen then just strcpy to finish.
|
||||
*/
|
||||
# ifdef USE_AS_WCSCPY
|
||||
rep stosl
|
||||
# else
|
||||
rep stosb
|
||||
# endif
|
||||
movq %r8, %rdi
|
||||
jmp OVERFLOW_STRCPY
|
||||
END(STRNCPY)
|
||||
#endif
@ -27,7 +27,8 @@
#define VEC_SIZE 32
#include "x86-vec-macros.h"

#define USE_WITH_AVX 1
#define USE_WITH_AVX2 1

#define SECTION(p) p##.avx

/* 4-byte mov instructions with AVX2. */