x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions

Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save, it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test).
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as geometric mean of all ratios of
    New Implementation / Old Implementation.

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.958

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      ->  819 / 1874 -> 0.437
    strcpy-evex      ->  700 / 1074 -> 0.652
    stpcpy-evex      ->  735 / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1184 / 2832 -> 0.418

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-evex.S    -> strcpy, stpcpy, strcat
           strncpy-evex.S   -> strncpy
           strncat-evex.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top.

    3. All implementations have an optional define:
        `USE_EVEX_MASKED_STORE`
       Setting to one uses evex-masked stores for handling short
       strings.  This saves code size and branches.  It's disabled
       for all implementations at the moment as there are some
       serious drawbacks to masked stores in certain cases, but
       that may be fixed on future architectures.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
This commit is contained in:
Noah Goldstein 2022-11-08 17:38:38 -08:00
parent d44e116428
commit f049f52dfe
7 changed files with 2187 additions and 1245 deletions

View File

@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
#define USE_AS_STRNCPY
#define STRCPY STPNCPY
#include "strcpy-evex.S"
#define STRNCPY STPNCPY
#include "strncpy-evex.S"

View File

@ -1,286 +1,7 @@
/* strcat with 256-bit EVEX instructions.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
# include <sysdep.h>
# ifndef STRCAT
# define STRCAT __strcat_evex
# endif
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
/* zero register */
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMM0 ymm17
# define YMM1 ymm18
# define USE_AS_STRCAT
/* Number of bytes in a vector register */
# define VEC_SIZE 32
.section .text.evex,"ax",@progbits
/* Old strcat entry: find the terminating null of dst (%rdi).  The
offset of that null from dst is accumulated in %rax and handed to
L(StartStrcpyPart).  %r9 keeps the original dst; for strncat %r8
keeps the length argument from %rdx. */
ENTRY (STRCAT)
mov %rdi, %r9
# ifdef USE_AS_STRNCAT
mov %rdx, %r8
# endif
/* rax = 0 so L(exit_null_on_first_vector) can add the bit index
directly. */
xor %eax, %eax
/* ecx = offset of dst inside its (VEC_SIZE * 4)-aligned block. */
mov %edi, %ecx
and $((VEC_SIZE * 4) - 1), %ecx
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
/* If dst lies in the last VEC of that block, use the aligned load
with masking instead of an unaligned load from dst. */
cmp $(VEC_SIZE * 3), %ecx
ja L(fourth_vector_boundary)
/* Unaligned compare of the first VEC against zero (predicate 0 =
equal); each set bit in k0 marks a null byte. */
vpcmpb $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %edx
test %edx, %edx
jnz L(exit_null_on_first_vector)
/* No null yet: continue from the VEC_SIZE-aligned address. */
mov %rdi, %rax
and $-VEC_SIZE, %rax
jmp L(align_vec_size_start)
L(fourth_vector_boundary):
/* Aligned compare of the VEC containing dst, then drop the match
bits that belong to bytes before dst. */
mov %rdi, %rax
and $-VEC_SIZE, %rax
vpcmpb $0, (%rax), %YMMZERO, %k0
mov $-1, %r10d
/* Only the low 5 bits of cl are used by the 32-bit shift; after
the subtraction they equal dst's offset within its aligned VEC. */
sub %rax, %rcx
shl %cl, %r10d
kmovd %k0, %edx
and %r10d, %edx
jnz L(exit)
/* Unrolled scan for the null byte, one aligned VEC at a time.  rax
is VEC_SIZE-aligned; the VEC at (%rax) itself has already been
checked.  Each group tests the VECs at offsets 1..4 from rax and
jumps to the exit stub that adds the matching VEC offset. */
L(align_vec_size_start):
vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
kmovd %k0, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
/* The VEC at offset 5 is compared before rax advances by 4 VECs,
so relative to the new rax a hit is in the "second" VEC. */
vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
add $(VEC_SIZE * 4), %rax
kmovd %k4, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
kmovd %k4, %edx
add $(VEC_SIZE * 4), %rax
test %edx, %edx
jnz L(exit_null_on_second_vector)
vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
add $(VEC_SIZE * 4), %rax
kmovd %k4, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
/* Step one VEC at a time until rax is (VEC_SIZE * 4)-aligned, then
enter the 4x loop.  In these steps rax is advanced onto the VEC
that was just compared, so a hit leaves through L(exit). */
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
add $(VEC_SIZE * 5), %rax
kmovd %k4, %edx
test %edx, %edx
jnz L(exit)
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
add $VEC_SIZE, %rax
kmovd %k0, %edx
test %edx, %edx
jnz L(exit)
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
add $VEC_SIZE, %rax
kmovd %k0, %edx
test %edx, %edx
jnz L(exit)
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
add $VEC_SIZE, %rax
kmovd %k1, %edx
test %edx, %edx
jnz L(exit)
/* Skip the VEC just checked before falling into the loop. */
add $VEC_SIZE, %rax
/* Main scan loop: examine four aligned VECs per iteration by
folding them with unsigned byte minimum; a zero byte in any of
them yields a zero byte in the folded result. */
.p2align 4
L(align_four_vec_loop):
VMOVA (%rax), %YMM0
VMOVA (VEC_SIZE * 2)(%rax), %YMM1
vpminub VEC_SIZE(%rax), %YMM0, %YMM0
vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
vpminub %YMM0, %YMM1, %YMM0
/* If K0 != 0, there is a null byte. */
vpcmpb $0, %YMM0, %YMMZERO, %k0
add $(VEC_SIZE * 4), %rax
ktestd %k0, %k0
jz L(align_four_vec_loop)
/* A null is somewhere in the four VECs just processed.  Re-test
them individually; rax is moved back by 5 VECs so the shared
exit stubs (which add 1..4 VECs) give the right offset. */
vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
sub $(VEC_SIZE * 5), %rax
kmovd %k0, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
/* Not in the first three, so it must be in the last VEC: no test
is needed before computing the offset. */
vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
kmovd %k3, %edx
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $(VEC_SIZE * 4), %rax
jmp L(StartStrcpyPart)
/* Exit stubs for the null scan.  On entry rdx holds the match mask
of the VEC containing the null and rax the scan pointer (or 0 for
L(exit_null_on_first_vector)).  Each stub leaves in rax the byte
offset of the null from dst:
(rax - rdi) + index of lowest set bit + VEC offset of the stub. */
.p2align 4
L(exit):
sub %rdi, %rax
L(exit_null_on_first_vector):
bsf %rdx, %rdx
add %rdx, %rax
jmp L(StartStrcpyPart)
.p2align 4
L(exit_null_on_second_vector):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $VEC_SIZE, %rax
jmp L(StartStrcpyPart)
.p2align 4
L(exit_null_on_third_vector):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $(VEC_SIZE * 2), %rax
jmp L(StartStrcpyPart)
.p2align 4
L(exit_null_on_fourth_vector):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $(VEC_SIZE * 3), %rax
jmp L(StartStrcpyPart)
.p2align 4
L(exit_null_on_fifth_vector):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $(VEC_SIZE * 4), %rax
/* Falls through into L(StartStrcpyPart). */
/* Hand-off to the copy code: rax holds the offset of dst's null
and r9 the original dst.  The function body continues in the
included strcpy-evex.S. */
.p2align 4
L(StartStrcpyPart):
/* rdi = address of dst's terminating null (where src is appended). */
lea (%r9, %rax), %rdi
mov %rsi, %rcx
mov %r9, %rax /* save result */
# ifdef USE_AS_STRNCAT
/* r8 is the length saved at entry; nothing to append if it is 0. */
test %r8, %r8
jz L(ExitZero)
# define USE_AS_STRNCPY
# endif
# include "strcpy-evex.S"
#ifndef STRCAT
# define STRCAT __strcat_evex
#endif
#define USE_AS_STRCAT
#define STRCPY STRCAT
#include "strcpy-evex.S"

View File

@ -0,0 +1,110 @@
/* strlen used for beginning of str{n}cat using EVEX 256/512.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* NOTE: This file is meant to be included by strcat-evex or
strncat-evex and does not stand alone. Before including %rdi
must be saved in %rax. */
/* Simple strlen implementation that ends at
L(strcat_strlen_done). */
/* Check the first five aligned VECs of the string at %rdi for a
zero-CHAR.  r8 = rdi rounded down to VEC_SIZE.  Clobbers rcx, r8,
k0 and VZERO; rdi is advanced as the scan progresses. */
vpxorq %VZERO_128, %VZERO_128, %VZERO_128
movq %rdi, %r8
andq $(VEC_SIZE * -1), %r8
VPCMPEQ (%r8), %VZERO, %k0
KMOV %k0, %VRCX
#ifdef USE_AS_WCSCPY
/* For wide chars the shift count must be in CHARs, so turn the
byte misalignment into a CHAR count first. */
subl %r8d, %edi
shrl $2, %edi
#endif
/* Shift out the match bits for CHARs that precede the string
start.  NOTE(review): for byte strings this relies on shrx masking
the count to the operand width selected by reg-macros.h -- confirm
VRDI/VRCX are sized to match CHAR_PER_VEC. */
shrx %VRDI, %VRCX, %VRCX
#ifdef USE_AS_WCSCPY
/* Restore rdi from the copy the includer saved in rax. */
movq %rax, %rdi
#endif
test %VRCX, %VRCX
jnz L(bsf_and_done_v0)
VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
KMOV %k0, %VRCX
/* From here rdi = r8 + VEC_SIZE; the bsf_and_done_vN exits add N
further VECs to it. */
leaq (VEC_SIZE)(%r8), %rdi
test %VRCX, %VRCX
jnz L(bsf_and_done_v0)
VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v1)
VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v2)
VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v3)
/* Round rdi down to a (VEC_SIZE * 4) boundary for the loop, which
reads starting at (VEC_SIZE * 4)(%rdi). */
andq $-(VEC_SIZE * 4), %rdi
/* Main strlen loop: four aligned VECs per iteration, folded in
pairs with VPMIN.  k1 flags zero-CHARs in min(v0, v1), k3 in
min(v2, v3); KORTEST sets ZF only when both masks are empty. */
.p2align 4,, 8
L(loop_2x_vec):
VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)
VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)
VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
VPTESTN %VMM(1), %VMM(1), %k1
VPTESTN %VMM(3), %VMM(3), %k3
/* rdi now addresses the first of the four VECs just loaded. */
subq $(VEC_SIZE * -4), %rdi
KORTEST %k1, %k3
jz L(loop_2x_vec)
/* Locate which of the four VECs holds the first zero-CHAR. */
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v0)
/* v0 had no zero-CHAR, so any bit in k1 comes from v1. */
KMOV %k1, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v1)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(bsf_and_done_v2)
/* Only v3 is left; k3 must be non-zero here. */
KMOV %k3, %VRCX
/* Exits: rdi = base + N VECs + index of first zero-CHAR, i.e. the
address of the string's terminator. */
L(bsf_and_done_v3):
addq $VEC_SIZE, %rdi
L(bsf_and_done_v2):
bsf %VRCX, %VRCX
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
jmp L(strcat_strlen_done)
.p2align 4,, 4
L(bsf_and_done_v1):
addq $VEC_SIZE, %rdi
L(bsf_and_done_v0):
bsf %VRCX, %VRCX
#ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
#else
addq %rcx, %rdi
#endif
/* Execution continues in the including file with rdi at the
terminating zero-CHAR. */
L(strcat_strlen_done):

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,520 @@
#ifndef STRNCAT
# define STRNCAT __strncat_evex
#endif
/* {wcs|str}ncat with 256/512-bit EVEX.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
#define USE_AS_STRNCAT
#define STRCAT STRNCAT
#include "strcat-evex.S"
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes. Turned off at the
moment. */
# define USE_EVEX_MASKED_STORE 0
# include <sysdep.h>
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif
# ifndef STRNCAT
# define STRNCAT __strncat_evex
# endif
# ifdef USE_AS_WCSCPY
# define MOVCHAR movl
# define VMOVU_MASK vmovdqu32
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
# define REP_MOVS rep movsd
# define VMASK_REG VR10
# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst
# define USE_WIDE_CHAR
# else
# define MOVCHAR movb
# define VMOVU_MASK vmovdqu8
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1
# define REP_MOVS rep movsb
# define VMASK_REG VRCX
# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst
# endif
# include "strncpy-or-cat-overflow-def.h"
# include "reg-macros.h"
# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text), "ax", @progbits
/* STRNCAT: rdi = dst, rsi = src, rdx = maximum number of CHARs to
append.  The original dst is kept in rax as the return value for
every exit path of this function. */
ENTRY(STRNCAT)
movq %rdi, %rax
/* NB: It's safe to filter out zero-length strings WITHOUT
setting null-term. Destination MUST be a null-terminated
string so essentially the work is already done. */
# ifdef USE_AS_WCSCPY
/* rcx = n - 1; any of the top 8 bits set means n == 0 (wrapped
around) or n is too large to be a real length. */
leaq -1(%rdx), %rcx
shrq $56, %rcx
jnz L(zero_len)
# else
/* Signed test: taken for n == 0 and for n with the top bit set.
L(zero_len) relies on the flags set here. */
test %rdx, %rdx
jle L(zero_len)
# endif
/* Advances rdi to dst's terminating zero-CHAR. */
# include "strcat-strlen-evex.h.S"
/* If a VEC_SIZE load from src would cross a page boundary, take
the careful path. */
movl %esi, %ecx
andl $(PAGE_SIZE - 1), %ecx
cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
ja L(page_cross)
/* First-VEC handling.  Reached directly when the load below stays
within src's page, or from L(page_cross) once it found no
zero-CHAR before the page end while n extends past it.  k0 gets
one bit per zero-CHAR in the first VEC of src. */
L(page_cross_continue):
VMOVU (%rsi), %VMM(0)
VPTESTN %VMM(0), %VMM(0), %k0
/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
<= CHAR_PER_VEC with masked instructions (which have
potential for dramatically bad perf if dst splits a page and
is not in the TLB). */
# if USE_EVEX_MASKED_STORE
KMOV %k0, %VRCX
FIND_FIRST_ONE (VRCX, VR8)
/* n <= index of first zero-CHAR: the copy is bounded by n. */
cmpq %r8, %rdx
jbe L(less_1x_vec)
test %VRCX, %VRCX
jz L(more_1x_vec)
/* Mask of all CHARs up to and including the first zero-CHAR. */
blsmsk %VRCX, %VRCX
KMOV %VRCX, %k1
VMOVU_MASK %VMM(0), (%rdi){%k1}
ret
L(less_1x_vec):
/* Mask of the low n CHARs; the terminator is written separately
at dst[n]. */
mov $-1, %VRCX
bzhi %VRDX, %VRCX, %VRCX
KMOV %VRCX, %k1
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
VMOVU_MASK %VMM(0), (%rdi){%k1}
ret
# else
KMOV %k0, %VMASK_REG
/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
%VMASK_REG, %VRCX` for wcsncat. */
FIND_FIRST_ONE (VMASK_REG, VRCX)
/* n <= index of first zero-CHAR: copy exactly n CHARs. */
cmpq %rcx, %rdx
jbe L(less_1x_vec)
/* If there were no zero-CHARs (rcx was zero before
FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
cmpl $CHAR_PER_VEC, %ecx
je L(more_1x_vec)
/* The string ends inside the first VEC: copy strlen CHARs. */
movl %ecx, %edx
/* edx = number of CHARs to copy (less than one VEC).  Each size
class copies with two possibly overlapping moves (head and tail)
and then writes the terminator at dst[edx]. */
L(less_1x_vec):
# if VEC_SIZE == 64
cmpl $(32 / CHAR_SIZE), %edx
jae L(copy_32_63)
# endif
cmpl $(16 / CHAR_SIZE), %edx
jae L(copy_16_31)
cmpl $(8 / CHAR_SIZE), %edx
jae L(copy_8_15)
# ifdef USE_AS_WCSCPY
/* 0 or 1 wide CHARs: store the first CHAR, then the terminator
(which overwrites it when edx == 0). */
vmovd %VMM_128(0), (%rdi)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# else
cmpl $4, %edx
jae L(copy_4_7)
/* 0..3 bytes: byte 0 is kept in ecx; for 2 or 3 bytes, bytes 1-2
are copied as a word first. */
movzbl (%rsi), %ecx
cmpl $1, %edx
jbe L(set_null_term)
movzwl 1(%rsi), %esi
movw %si, 1(%rdi)
.p2align 4,, 1
L(set_null_term):
/* Byte 0 is stored before the terminator so that edx == 0 still
leaves dst[0] == 0. */
movb %cl, (%rdi)
MOVCHAR $0, (%rdi, %rdx)
ret
# endif
# if VEC_SIZE == 64
.p2align 4,, 6
L(copy_32_63):
VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
VMOVU %VMM_256(0), (%rdi)
VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# endif
.p2align 4,, 6
L(copy_16_31):
/* Use xmm1 explicitly here as it won't require a `vzeroupper`
and will save code size. */
vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
VMOVU %VMM_128(0), (%rdi)
vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
.p2align 4,, 2
L(copy_8_15):
movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
vmovq %VMM_128(0), (%rdi)
movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# ifndef USE_AS_WCSCPY
.p2align 4,, 12
L(copy_4_7):
movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
vmovd %VMM_128(0), (%rdi)
movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# endif
# endif
/* Target of the length filter at entry: return immediately when
n == 0, otherwise (n rejected as too large) tail-jump to
OVERFLOW_STRCAT, which is defined outside this file. */
.p2align 4,, 4
L(zero_len):
# ifdef USE_AS_WCSCPY
/* The wide-char filter used leaq/shrq, so the flags do not reflect
rdx; re-test it.  The byte variant arrives with the flags from its
own `test %rdx, %rdx`. */
test %rdx, %rdx
# endif
jne OVERFLOW_STRCAT
ret
/* The first VEC of src holds no zero-CHAR and n exceeds it: store
it, then continue with aligned loads from src. */
.p2align 4,, 8
L(more_1x_vec):
VMOVU %VMM(0), (%rdi)
/* We are going to align rsi here so will need to be able to re-
adjust rdi/rdx afterwards. NB: We filtered out huge lengths
so rsi + rdx * CHAR_SIZE cannot overflow. */
leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
/* rdi temporarily holds dst - src so the same delta can be
re-applied to the aligned rsi. */
subq %rsi, %rdi
andq $-(VEC_SIZE), %rsi
/* Also entered from L(loop_4x_vec) with rdi = dst - src delta and
rdx = end pointer.  After the two adjustments below, rdx is the
remaining length measured from (VEC_SIZE)(%rsi). */
L(loop_last_4x_vec):
addq %rsi, %rdi
subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
/* Bytes -> wide CHARs. */
shrq $2, %rdx
# endif
/* Will need this regardless. */
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VMASK_REG
cmpq $(CHAR_PER_VEC * 2), %rdx
ja L(more_2x_vec)
/* At most 2 VECs of length remain.  edx = remaining length counted
from (VEC_SIZE)(%rsi); VMASK_REG = zero-CHAR mask of VMM(1), the
VEC at that address. */
L(last_2x_vec):
FIND_FIRST_ONE (VMASK_REG, VRCX)
/* Length runs out at or before the first zero-CHAR. */
cmpl %ecx, %edx
jbe L(ret_vec_x1_len)
/* If there were no zero-CHARs (rcx was zero before
FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
cmpl $CHAR_PER_VEC, %ecx
jne L(ret_vec_x1)
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
addl $-CHAR_PER_VEC, %edx
/* Keep only the matches below the remaining length; ZF from bzhi
means none, so the copy is bounded by length (edx). */
bzhi %VRDX, %VRCX, %VR8
jz L(ret_vec_x2_len)
/* rdx = index of the terminator within the second VEC.  The
tail is copied as one VEC ending just before that position and
the terminator is stored explicitly. */
L(ret_vec_x2):
bsf %VRCX, %VRDX
L(ret_vec_x2_len):
VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
ret
.p2align 4,, 4
L(ret_vec_x1_len):
movl %edx, %ecx
/* Same as above for the first VEC, with the end index in rcx. */
L(ret_vec_x1):
VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
/* NOTE(review): this is the only exit in this function that uses
VZEROUPPER_RETURN rather than a plain `ret`; the macro is defined
outside this file -- confirm the difference is intended. */
VZEROUPPER_RETURN
/* Reached from L(more_4x_vec) after four VECs were stored and at
most 8 VECs of length remained: advance length, src and dst by
four VECs and reload VMM(1)/VMASK_REG for the next VEC, as
L(last_2x_vec) and L(more_2x_vec) expect. */
.p2align 4,, 8
L(last_4x_vec):
addl $-(CHAR_PER_VEC * 4), %edx
/* Offset 5 because rsi has not been advanced yet. */
VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VMASK_REG
subq $-(VEC_SIZE * 4), %rsi
subq $-(VEC_SIZE * 4), %rdi
cmpl $(CHAR_PER_VEC * 2), %edx
jbe L(last_2x_vec)
/* More than 2 VECs of length remain, so the first two VECs can be
checked for a zero-CHAR without comparing against the length.
Entered by jump or by fall-through from L(last_4x_vec), with
VMM(1)/VMASK_REG describing the VEC at (VEC_SIZE)(%rsi). */
.p2align 4,, 8
L(more_2x_vec):
# ifdef USE_AS_WCSCPY
xorl %ecx, %ecx
# endif
/* bsf sets ZF when the mask is empty; otherwise rcx is the
terminator index expected by L(ret_vec_x1). */
bsf %VMASK_REG, %VRCX
jnz L(ret_vec_x1)
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
VPTESTN %VMM(3), %VMM(3), %k0
KMOV %k0, %VMASK_REG
cmpq $(CHAR_PER_VEC * 4), %rdx
ja L(more_4x_vec)
/* Adjust length before going to L(ret_vec_x3_len) or
L(ret_vec_x3). */
addl $(CHAR_PER_VEC * -2), %edx
FIND_FIRST_ONE (VMASK_REG, VRCX)
cmpl %ecx, %edx
jbe L(ret_vec_x3_len)
/* If there were no zero-CHARs (rcx was zero before
FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
cmpl $CHAR_PER_VEC, %ecx
jne L(ret_vec_x3)
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
addl $-CHAR_PER_VEC, %edx
/* ZF from bzhi: no zero-CHAR within the remaining length, so the
copy is bounded by length (edx). */
bzhi %VRDX, %VRCX, %VR8
jz L(ret_vec_x4_len)
L(ret_vec_x4):
bsf %VRCX, %VRDX
/* Copy the VEC that ends just before the end index in rdx and
store the terminator at that index. */
L(ret_vec_x4_len):
VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
ret
.p2align 4,, 4
L(ret_vec_x3_len):
movl %edx, %ecx
/* Same for the third VEC, with the end index in rcx. */
L(ret_vec_x3):
VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
/* More than 4 VECs of length remain: finish VECs 3 and 4 without
length checks, then either finish via L(last_4x_vec) or set up
the 4x-unrolled aligned loop. */
.p2align 4,, 8
L(more_4x_vec):
# ifdef USE_AS_WCSCPY
xorl %ecx, %ecx
# endif
bsf %VMASK_REG, %VRCX
jnz L(ret_vec_x3)
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x4)
VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
/* Check if we are near the end before aligning. */
cmpq $(CHAR_PER_VEC * 8), %rdx
jbe L(last_4x_vec)
/* Add rsi to rdx (length) before aligning rsi. NB: Since we
filtered out huge lengths this cannot overflow. */
# ifdef USE_AS_WCSCPY
leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
addq %rsi, %rdx
# endif
/* Subtract rsi from rdi before aligning (add back will have
correct rdi for aligned rsi). */
subq %rsi, %rdi
/* Advance rsi by 5 VECs and round down to a 4-VEC boundary. */
subq $-(VEC_SIZE * 5), %rsi
andq $(VEC_SIZE * -4), %rsi
/* Load first half of the loop before entry. */
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
/* k2 / k4 flag zero-CHARs in min(v0, v1) / min(v2, v3). */
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
/* Offset rsi by VEC_SIZE so that we can jump to
L(loop_last_4x_vec). */
addq $-(VEC_SIZE), %rsi
KORTEST %k2, %k4
jnz L(loop_4x_done)
/* Store loop end in r9. */
leaq -(VEC_SIZE * 5)(%rdx), %r9
/* Main copy loop.  On entry VMM(0..3) hold four source VECs known
to contain no zero-CHAR; rdi holds dst - src (so (%rdi, %rsi) is
the destination), rsi is biased by -VEC_SIZE and r9 is the loop
end bound. */
.p2align 4,, 11
L(loop_4x_vec):
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
subq $(VEC_SIZE * -4), %rsi
/* Close to the length limit: leave the loop and finish with the
length-checked tail code. */
cmpq %rsi, %r9
jbe L(loop_last_4x_vec)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
KORTEST %k2, %k4
jz L(loop_4x_vec)
/* A zero-CHAR is in one of VMM(0..3): find which one, storing the
VECs that precede it. */
L(loop_4x_done):
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
/* Restore rdi (dst). */
addq %rsi, %rdi
/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
test with bsf. */
bsf %VRCX, %VRCX
jnz L(ret_vec_x1)
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
/* VMM(0) had no zero-CHAR, so bits in k2 belong to VMM(1). */
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
bsf %VRCX, %VRCX
jnz L(ret_vec_x3)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
/* Only VMM(3) is left.  Copy the VEC that ends with (and includes)
the zero-CHAR, so no separate terminator store is needed. */
KMOV %k4, %VRCX
bsf %VRCX, %VRCX
VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
/* A VEC_SIZE load from src would cross a page boundary.  Instead,
load the aligned VEC containing src and shift the match mask so
bit 0 corresponds to the first CHAR of src.  On entry ecx = src's
offset within its page. */
.p2align 4,, 4
L(page_cross):
movq %rsi, %r8
andq $(VEC_SIZE * -1), %r8
VPCMPEQ (%r8), %VZERO, %k0
# ifdef USE_AS_WCSCPY
KMOV %k0, %VR9
/* Shift count in wide CHARs within the VEC. */
shrl $2, %ecx
andl $(CHAR_PER_VEC - 1), %ecx
shrx %VRCX, %VR9, %VRCX
# else
KMOV %k0, %VRCX
shrx %VRSI, %VRCX, %VRCX
# endif
/* r8 = number of CHARs from src to the end of its aligned VEC. */
subl %esi, %r8d
andl $(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY
shrl $2, %r8d
# endif
/* If n does not reach past the end of that VEC, everything needed
is in the data already loaded. */
cmpq %r8, %rdx
jbe L(page_cross_small)
/* Optimizing more for space as this is very cold code. This
saves 2x cache lines. */
/* This adds one to the later result which will get correct
copy bounds. NB: this can never zero-out a non-zero RCX as
to be in the page cross case rsi cannot be aligned and we
already right-shift rcx by the misalignment. */
shl %VRCX
jz L(page_cross_continue)
/* rcx = index of the zero-CHAR + 1, i.e. the CHAR count including
the terminator; copy that many CHARs from rsi to rdi. */
bsf %VRCX, %VRCX
REP_MOVS
ret
L(page_cross_small):
/* tzcnt sets ZF when its result is 0, i.e. the first CHAR of src
is the terminator and nothing needs copying. */
tzcnt %VRCX, %VRCX
jz L(page_cross_setz)
/* Copy min(index of zero-CHAR, n) CHARs. */
cmpl %edx, %ecx
cmova %edx, %ecx
# ifdef USE_AS_WCSCPY
rep movsd
# else
rep movsb
# endif
/* rdi now points just past the copied CHARs (or at the old end of
dst when nothing was copied): write the terminator there. */
L(page_cross_setz):
MOVCHAR $0, (%rdi)
ret
END(STRNCAT)
#endif

View File

@ -1,7 +1,990 @@
#ifndef STRNCPY
# define STRNCPY __strncpy_evex
#endif
/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
#define USE_AS_STRNCPY
#define STRCPY STRNCPY
#include "strcpy-evex.S"
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes. Turned off at the
moment. */
# define USE_EVEX_MASKED_STORE 0
# include <sysdep.h>
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif
# ifndef STRNCPY
# define STRNCPY __strncpy_evex
# endif
# ifdef USE_AS_WCSCPY
# define VMOVU_MASK vmovdqu32
# define VPCMPEQ vpcmpeqd
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define CHAR_SIZE 4
# define REP_MOVS rep movsd
# define REP_STOS rep stosl
# define USE_WIDE_CHAR
# else
# define VMOVU_MASK vmovdqu8
# define VPCMPEQ vpcmpeqb
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define CHAR_SIZE 1
# define REP_MOVS rep movsb
# define REP_STOS rep stosb
# endif
# include "strncpy-or-cat-overflow-def.h"
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# include "reg-macros.h"
# define VZERO VMM(7)
# define VZERO_256 VMM_256(7)
# define VZERO_128 VMM_128(7)
# if VEC_SIZE == 64
# define VZERO_HALF VZERO_256
# else
# define VZERO_HALF VZERO_128
# endif
.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
/* Filter zero length strings and very long strings. Zero
length strings just return, very long strings are handled by
just running rep stos{b|l} to zero set (which will almost
certainly segfault), if that succeeds then just calling
OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
# ifdef USE_AS_WCSCPY
decq %rdx
movq %rdx, %rax
/* 56 is end of max supported address space. */
shr $56, %rax
jnz L(zero_len)
# else
decq %rdx
/* If the flag needs to become `jb` replace `dec` with `sub`.
*/
jl L(zero_len)
# endif
vpxorq %VZERO_128, %VZERO_128, %VZERO_128
movl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(page_cross)
L(page_cross_continue):
VMOVU (%rsi), %VMM(0)
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
/* If no STPCPY just save end ahead of time. */
# ifndef USE_AS_STPCPY
movq %rdi, %rax
# endif
cmpq $(CHAR_PER_VEC), %rdx
/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
<= CHAR_PER_VEC with masked instructions (which have
potential for dramatically bad perf if dst splits a page and
is not in the TLB). */
# if USE_EVEX_MASKED_STORE
/* `jae` because length rdx is now length - 1. */
jae L(more_1x_vec)
/* If there were multiple zero-CHAR matches in the first VEC,
VRCX will be overset but that's fine since any oversets were
at zero-positions anyway. */
# ifdef USE_AS_STPCPY
tzcnt %VRCX, %VRAX
cmpl %eax, %edx
cmovb %edx, %eax
# ifdef USE_AS_WCSCPY
adcl $0, %eax
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
adcq %rdi, %rax
# endif
# endif
dec %VRCX
/* Zero out all non-zero CHAR's after the first zero match. */
KMOV %VRCX, %k1
/* Use VZERO as destination so this can be reused for
L(zfill_less_vec) (which, if jumped to by subsequent logic,
will have zeroed out VZERO). */
VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
L(zfill_less_vec):
/* Get mask for what we need to set. */
incl %edx
mov $-1, %VRCX
bzhi %VRDX, %VRCX, %VRCX
KMOV %VRCX, %k1
VMOVU_MASK %VZERO, (%rdi){%k1}
ret
.p2align 4,, 4
L(zero_len):
cmpq $-1, %rdx
jne L(best_effort_strncpy)
movq %rdi, %rax
ret
.p2align 4,, 8
L(more_1x_vec):
# else
/* `jb` because length rdx is now length - 1. */
jb L(less_1x_vec)
# endif
/* This may overset but that's fine because we still need to zero
fill. */
VMOVU %VMM(0), (%rdi)
/* Length must be >= CHAR_PER_VEC so match here means we must
zero-fill. */
test %VRCX, %VRCX
jnz L(zfill)
/* We are going to align rsi here so will need to be able to re-
adjust rdi/rdx afterwards. NB: We filtered out huge lengths
so rsi + rdx * CHAR_SIZE cannot overflow. */
leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
subq %rsi, %rdi
andq $-(VEC_SIZE), %rsi
L(loop_last_4x_vec):
addq %rsi, %rdi
subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
shrq $2, %rdx
# endif
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VRCX
/* -1 because of the `dec %rdx` earlier. */
cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
ja L(more_2x_vec)
L(last_2x_vec):
/* This will need to be computed no matter what. We do it
ahead of time for CHAR_PER_VEC == 64 because we can't adjust
the value of `tzcnt` with a shift. */
# if CHAR_PER_VEC == 64
tzcntq %rcx, %rcx
# endif
cmpl $(CHAR_PER_VEC), %edx
jb L(ret_vec_x1_len)
/* Separate logic for CHAR_PER_VEC == 64 because we already did
`tzcnt` on VRCX. */
# if CHAR_PER_VEC == 64
/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
cmpb $CHAR_PER_VEC, %cl
jnz L(ret_vec_x1_no_bsf)
# else
test %VRCX, %VRCX
jnz L(ret_vec_x1)
# endif
VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
KMOV %k0, %VRCX
# if CHAR_PER_VEC < 64
/* This essentially adds CHAR_PER_VEC to the computed result. */
shlq $CHAR_PER_VEC, %rcx
# else
tzcntq %rcx, %rcx
addl $CHAR_PER_VEC, %ecx
# endif
.p2align 4,, 4
L(ret_vec_x1_len):
/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
already been done. */
# if CHAR_PER_VEC < 64
tzcntq %rcx, %rcx
# endif
cmpl %ecx, %edx
jbe L(ret_vec_x1_len_no_zfill)
/* Fall through (expectation) is copy len < buffer len. */
VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
movl %ecx, %edx
# ifdef USE_AS_STPCPY
/* clear flags. */
xorl %ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
# else
leal (VEC_SIZE)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
ret
.p2align 4,, 10
L(ret_vec_x1):
bsf %VRCX, %VRCX
L(ret_vec_x1_no_bsf):
VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
subl %ecx, %edx
cmpl $CHAR_PER_VEC, %edx
jb L(ret_vec_x1_len_no_zfill_mov)
/* Fall through (expectation) is copy len < buffer len. */
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
.p2align 4,, 8
L(last_4x_vec):
/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
$(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
using `movzbl`. */
# if CHAR_PER_VEC == 64
movzbl %dl, %edx
# else
andl $(CHAR_PER_VEC * 4 - 1), %edx
# endif
VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VRCX
subq $-(VEC_SIZE * 4), %rsi
subq $-(VEC_SIZE * 4), %rdi
cmpl $(CHAR_PER_VEC * 2 - 1), %edx
jbe L(last_2x_vec)
.p2align 4,, 8
L(more_2x_vec):
/* Copy VEC 1..3 one at a time, leaving for a zfill label as soon as
a VEC contains a zero-CHAR. On entry VMM(1) holds the VEC loaded
from VEC_SIZE(%rsi) and VRCX its zero-CHAR mask. */
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
test %VRCX, %VRCX
/* Must fill at least 2x VEC. */
jnz L(zfill_vec1)
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
/* Must fill at least 1x VEC. */
jnz L(zfill_vec2)
/* VEC 3 is loaded and tested but not stored yet; each path below
stores it (or an overlapping VEC) itself. */
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
VPTESTN %VMM(3), %VMM(3), %k0
KMOV %k0, %VRCX
/* Check if len is more than 4x VEC. -1 because rdx is len - 1. */
cmpq $(CHAR_PER_VEC * 4 - 1), %rdx
ja L(more_4x_vec)
/* Rebase edx onto VEC 3; a borrow means the copy ends before VEC 3
is complete. */
subl $(CHAR_PER_VEC * 3), %edx
jb L(ret_vec_x3_len)
test %VRCX, %VRCX
jnz L(ret_vec_x3)
/* No zero-CHAR in VEC 3: store it and look for one in VEC 4. */
VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
KMOV %k0, %VRCX
tzcnt %VRCX, %VRCX
cmpl %ecx, %edx
jbe L(ret_vec_x4_len_no_zfill)
/* Fall through (expectation) is copy len < buffer len. */
/* The displacement simplifies to VEC_SIZE * 3 + CHAR_SIZE, so the
zero store ends at CHAR index edx (inclusive) of VEC 4. */
VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
/* No flag clearing is needed on this path: the cmpl above left CF
clear because edx > ecx. */
movl %ecx, %edx
L(ret_vec_x4_len_no_zfill):
/* Overlapping copy of one VEC ending at CHAR index edx of VEC 4. */
VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
/* CF (from the cmpl above) adds one CHAR/byte to the returned
pointer when edx < ecx. */
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
# else
leal (VEC_SIZE * 4 + 0)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
ret
L(ret_vec_x3_len):
/* Reached from L(more_2x_vec) when subtracting CHAR_PER_VEC * 3 from
edx borrowed. Adding one CHAR_PER_VEC back leaves edx reduced by
CHAR_PER_VEC * 2 in total, which matches the VEC_SIZE * 3
based addressing used below. */
addl $(CHAR_PER_VEC * 1), %edx
tzcnt %VRCX, %VRCX
cmpl %ecx, %edx
jbe L(ret_vec_x3_len_no_zfill)
/* Fall through (expectation) is copy len < buffer len. */
/* The displacement simplifies to VEC_SIZE * 2 + CHAR_SIZE. */
VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
movl %ecx, %edx
# ifdef USE_AS_STPCPY
/* Clear flags so the adc below adds nothing. This label is also
reached from L(ret_vec_x3) after a `subl` that borrowed. */
xorl %ecx, %ecx
# endif
.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
/* Overlapping copy of one VEC through the same addressing form as
the zero store above, indexed by edx. */
VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
/* CF is consumed here: set by the cmpl when edx < ecx, cleared by
the xorl on the other paths. */
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
# else
leal (VEC_SIZE * 3 + 0)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
ret
.p2align 4,, 8
L(ret_vec_x3):
/* VEC 3 contains a zero-CHAR (reached via `jnz`); ecx becomes its
index within the VEC. */
bsf %VRCX, %VRCX
/* Zero the VEC-sized window addressed from edx before copying; the
displacement simplifies to VEC_SIZE * 3 + CHAR_SIZE. */
VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
/* NOTE(review): this is a signed `jl` on edx - ecx. Both values
look like small non-negative indices here, in which case it
behaves like an unsigned `jb` -- confirm against the caller. */
subl %ecx, %edx
jl L(ret_vec_x3_len_no_zfill_mov)
/* Store all of VEC 3, then a zero VEC starting at the zero-CHAR. */
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
/* Return the address of CHAR index ecx within VEC 3. */
leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
.p2align 4,, 8
L(more_4x_vec):
/* Finish VEC 3 and VEC 4, then either hand the tail to
L(last_4x_vec) or enter the aligned 4x VEC loop. */
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
test %VRCX, %VRCX
jnz L(zfill_vec3)
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec4)
/* Recheck length before aligning. */
cmpq $(CHAR_PER_VEC * 8 - 1), %rdx
jbe L(last_4x_vec)
/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */
/* rdx becomes rsi plus the length in bytes; it is converted back to
a length at L(loop_4x_done). */
# ifdef USE_AS_WCSCPY
leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
addq %rsi, %rdx
# endif
/* rdi becomes dst - src so the loop can store through (%rdi, %rsi)
while only rsi advances. */
subq %rsi, %rdi
/* Add VEC_SIZE * 5 and clear the low bits: rsi is now a multiple of
VEC_SIZE * 4. */
subq $-(VEC_SIZE * 5), %rsi
andq $(VEC_SIZE * -4), %rsi
/* Load first half of the loop before entry. */
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
/* A pairwise minimum has a zero CHAR wherever either input has one,
so two VPTESTN cover all four VECs. */
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
/* Offset rsi by VEC_SIZE so that we can jump to
L(loop_last_4x_vec). */
addq $-(VEC_SIZE), %rsi
KORTEST %k2, %k4
jnz L(loop_4x_done)
/* Store loop end in r9. */
leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
.p2align 4,, 11
L(loop_4x_vec):
/* Store the four VECs tested on the previous iteration (or before
entry). Offsets start at VEC_SIZE * 1 because rsi was moved back
by VEC_SIZE above. */
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
subq $(VEC_SIZE * -4), %rsi
/* Leave the loop once rsi has reached the end marker in r9. */
cmpq %rsi, %r9
jbe L(loop_last_4x_vec)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
/* Keep looping while neither pair contains a zero-CHAR. */
KORTEST %k2, %k4
jz L(loop_4x_vec)
L(loop_4x_done):
/* One of VMM(0)..VMM(3) contains a zero-CHAR. Undo the pointer
arithmetic set up before the loop, then store VECs in order until
the one holding the zero-CHAR is found. */
/* Restore rdx (length). */
subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
/* Convert the byte count back to CHARs (the shift by 2 implies
CHAR_SIZE == 4 here). */
shrq $2, %rdx
# endif
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
/* Restore rdi (dst). */
addq %rsi, %rdi
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec1)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
/* k2 was computed from min(VMM(0), VMM(1)). VMM(0) has no zero-CHAR
at this point, so k2 describes VMM(1). */
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec2)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec3)
VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
/* Likewise k4 came from min(VMM(2), VMM(3)) and VMM(2) is clean, so
it describes VMM(3). Execution falls through into
L(zfill_vec4). */
KMOV %k4, %VRCX
// Zfill more....
.p2align 4,, 4
L(zfill_vec4):
/* Falls through L(zfill_vec2), so the net adjustment from here is
rdi += VEC_SIZE * 4 and rdx -= CHAR_PER_VEC * 3. */
subq $(VEC_SIZE * -2), %rdi
addq $(CHAR_PER_VEC * -2), %rdx
L(zfill_vec2):
/* Net adjustment from here: rdi += VEC_SIZE * 2 and
rdx -= CHAR_PER_VEC. */
subq $(VEC_SIZE * -2), %rdi
addq $(CHAR_PER_VEC * -1), %rdx
L(zfill):
/* VRCX must be non-zero. */
bsf %VRCX, %VRCX
/* Adjust length / dst for zfill. */
subq %rcx, %rdx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
/* The return value is the (adjusted) start of the zero fill. */
movq %rdi, %rax
# endif
L(zfill_from_page_cross):
/* From here on out it is just memset(rdi, 0, rdx). Note that the
tail stores below end CHAR_SIZE past (%rdi, %rdx, CHAR_SIZE),
i.e. CHAR index rdx itself is also zeroed. */
cmpq $CHAR_PER_VEC, %rdx
jb L(zfill_less_vec)
L(zfill_more_1x_vec):
/* One VEC at the start and one (possibly overlapping) VEC ending at
CHAR index rdx. */
VMOVU %VZERO, (%rdi)
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
ja L(zfill_more_2x_vec)
L(zfill_done0):
ret
/* Coming from vec1/vec2 we must be able to zfill at least 2x
VEC. */
.p2align 4,, 8
L(zfill_vec3):
/* Falls through into L(zfill_vec1) after moving rdi forward by two
VECs and reducing rdx by the matching CHAR count. */
subq $(VEC_SIZE * -2), %rdi
addq $(CHAR_PER_VEC * -2), %rdx
.p2align 4,, 2
L(zfill_vec1):
bsfq %rcx, %rcx
/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
*/
leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
subq %rcx, %rdx
# ifdef USE_AS_STPCPY
/* The return value is the start of the zero fill. */
movq %rdi, %rax
# endif
/* First VEC and the (possibly overlapping) VEC ending at CHAR index
rdx. */
VMOVU %VZERO, (%rdi)
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpq $(CHAR_PER_VEC * 2), %rdx
jb L(zfill_done0)
L(zfill_more_2x_vec):
/* Second VEC from each end. */
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
VMOVU %VZERO, (VEC_SIZE)(%rdi)
subq $(CHAR_PER_VEC * 4 - 1), %rdx
jbe L(zfill_done)
/* rdx becomes a pointer VEC_SIZE * 4 before the end of the region
(end = one CHAR past the original index rdx). */
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rdx, CHAR_SIZE), %rdx
# else
addq %rdi, %rdx
# endif
/* Third and fourth VEC from each end; together with the stores
above the first and last 4 VECs are now zeroed. */
VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdi, %rdx
jbe L(zfill_done)
/* Align rdi and zfill loop. */
andq $-(VEC_SIZE), %rdi
.p2align 4,, 12
L(zfill_loop_4x_vec):
/* Aligned stores, 4 VEC per iteration, until rdi reaches rdx; the
tail beyond rdx was already zeroed above. */
VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdi, %rdx
ja L(zfill_loop_4x_vec)
L(zfill_done):
ret
/* Less 1x VEC case if we are not using evex masked store. */
# if !USE_EVEX_MASKED_STORE
.p2align 4,, 8
L(copy_1x):
/* Special case for copy 1x. It can be handled quickly and many
buffer sizes have convenient alignment. */
VMOVU %VMM(0), (%rdi)
/* If no zeros then we are done. */
testl %ecx, %ecx
jz L(ret_1x_1x)
/* Need to zfill, note we know that length <= CHAR_PER_VEC so we
only handle the small case here. */
bsf %VRCX, %VRCX
L(zfill_less_vec_no_bsf):
/* Adjust length / dst then just zfill less_vec. */
subq %rcx, %rdx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
/* The return value is the start of the zero fill. */
movq %rdi, %rax
# endif
L(zfill_less_vec):
/* At least half a VEC to clear: two half-VEC stores, one at the
start and one ending at CHAR index rdx (inclusive). */
cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
jb L(zfill_less_half)
VMOVU %VZERO_HALF, (%rdi)
VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
ret
# ifdef USE_AS_STPCPY
L(ret_1x_1x):
/* Return the address one CHAR past index rdx. */
leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
ret
# endif
# if VEC_SIZE == 64
.p2align 4,, 4
L(copy_32_63):
/* Only assembled for VEC_SIZE == 64. Copies with two overlapping
32-byte stores: the low half of VMM(0) at the start and a second
load/store that ends at CHAR index rdx (inclusive). */
/* Overfill to avoid branches. */
VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
VMOVU %VMM_256(0), (%rdi)
VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
/* We are taking advantage of the fact that to be here we must
be writing null-term as (%rdi, %rcx) we have a byte of lee-
way for overwriting. */
cmpl %ecx, %edx
ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
/* CF from the cmpl above is set when edx < ecx and adds one
CHAR/byte to the returned pointer. */
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
ret
# endif
.p2align 4,, 4
L(copy_16_31):
/* Copies with two overlapping 16-byte stores: the low 16 bytes of
VMM(0) at the start and a second load/store ending at CHAR index
rdx (inclusive). */
/* Overfill to avoid branches. */
vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
VMOVU %VMM_128(0), (%rdi)
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
/* The flags from this compare are consumed by the branch selected
below and, for stpcpy builds, by the adc on the return path. */
cmpl %ecx, %edx
/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
we have a larger copy block for 32-63 so this just falls
through to zfill 16-31. If VEC_SIZE == 32 then we check for
full zfill of less 1x VEC. */
# if VEC_SIZE == 64
jbe L(ret_16_31)
/* edx > ecx: rebase length and dst onto the zero-CHAR position and
zero the remainder. */
subl %ecx, %edx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
L(zfill_less_half):
L(zfill_less_32):
/* Two overlapping 16-byte zero stores when at least 16 bytes worth
of CHARs remain, otherwise defer to the smaller handler. */
cmpl $(16 / CHAR_SIZE), %edx
jb L(zfill_less_16)
VMOVU %VZERO_128, (%rdi)
VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
/* rax was already set on the zfill path, so return before the
pointer computation at L(ret_16_31). */
ret
# endif
L(ret_16_31):
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
ret
# else
/* VEC_SIZE == 32 begins. */
ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
/* CF from the cmpl above is set when edx < ecx and adds one
CHAR/byte to the returned pointer. */
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
ret
# endif
.p2align 4,, 4
L(copy_8_15):
/* Copies with two overlapping 8-byte stores: the low 8 bytes of
VMM(0) at the start and a second load/store ending at CHAR index
rdx (inclusive). rsi is reused as the scratch register for the
second load, so the source pointer is gone afterwards. */
/* Overfill to avoid branches. */
movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
vmovq %VMM_128(0), (%rdi)
movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpl %ecx, %edx
jbe L(ret_8_15)
/* edx > ecx: rebase length and dst onto the zero-CHAR position and
zero the remainder. */
subl %ecx, %edx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
.p2align 4,, 8
# if VEC_SIZE == 32
L(zfill_less_half):
# endif
L(zfill_less_16):
/* rcx is zeroed here and used as the zero source for the scalar
stores in this and the smaller zfill handlers. */
xorl %ecx, %ecx
cmpl $(8 / CHAR_SIZE), %edx
jb L(zfill_less_8)
movq %rcx, (%rdi)
movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
ret
.p2align 4,, 8
L(less_1x_vec):
/* Dispatch for copies shorter than one VEC. The `je` consumes flags
set by a compare before the jump to this label (not visible in
this section -- presumably length against one full VEC). */
je L(copy_1x)
/* We will need `tzcnt` result for all other copy sizes. */
tzcnt %VRCX, %VRCX
# if VEC_SIZE == 64
cmpl $(32 / CHAR_SIZE), %edx
jae L(copy_32_63)
# endif
cmpl $(16 / CHAR_SIZE), %edx
jae L(copy_16_31)
cmpl $(8 / CHAR_SIZE), %edx
jae L(copy_8_15)
# ifdef USE_AS_WCSCPY
/* Wide-char build: edx < 8 / CHAR_SIZE here. ecx == 0 means the
very first CHAR is the terminator, so only zeros are written. */
testl %ecx, %ecx
jz L(zfill_less_8_set_ret)
/* Copy the first CHAR and the CHAR at index rdx. */
movl (%rsi, %rdx, CHAR_SIZE), %esi
vmovd %VMM_128(0), (%rdi)
movl %esi, (%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
cmpl %ecx, %edx
L(ret_8_15):
/* CF (edx < ecx at the compare that led here) adds one CHAR to the
returned pointer. */
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# endif
ret
L(zfill_less_8_set_ret):
xorl %ecx, %ecx
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
L(zfill_less_8):
/* ecx is zero here: clear the first CHAR and the CHAR at index
rdx. */
movl %ecx, (%rdi)
movl %ecx, (%rdi, %rdx, CHAR_SIZE)
ret
# else
/* Byte build: edx < 8 here. */
cmpl $3, %edx
jb L(copy_0_3)
/* Overfill to avoid branches. */
/* Two overlapping 4-byte stores: the first four bytes and the four
bytes ending at index rdx (inclusive). */
movl -3(%rsi, %rdx), %esi
vmovd %VMM_128(0), (%rdi)
movl %esi, -3(%rdi, %rdx)
cmpl %ecx, %edx
jbe L(ret_4_7)
/* edx > ecx: rebase onto the terminator position and zero the
remainder. */
subq %rcx, %rdx
addq %rcx, %rdi
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
xorl %ecx, %ecx
.p2align 4,, 8
L(zfill_less_8):
/* ecx is zero on every path reaching this label in the visible
code. */
cmpl $3, %edx
jb L(zfill_less_3)
movl %ecx, (%rdi)
movl %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
/* rax was already set on the zfill path. */
ret
# endif
L(ret_4_7):
# ifdef USE_AS_STPCPY
L(ret_8_15):
/* CF from the compare that branched here adds one byte to the
returned pointer when edx < ecx. */
movl %edx, %eax
adcq %rdi, %rax
# endif
ret
.p2align 4,, 4
L(zfill_less_3):
/* Fewer than 3 remaining: a 2-byte store (skipped when edx == 0)
followed by a 1-byte store at index rdx. */
testl %edx, %edx
jz L(zfill_1)
movw %cx, (%rdi)
L(zfill_1):
movb %cl, (%rdi, %rdx)
ret
.p2align 4,, 8
L(copy_0_3):
/* r8d receives the first four source bytes from VMM(0). */
vmovd %VMM_128(0), %r8d
testl %edx, %edx
jz L(copy_1)
/* edx is 1 or 2: store the first two bytes, then decide between
zero filling and copying the byte at index rdx. */
movw %r8w, (%rdi)
cmpl %ecx, %edx
ja L(zfill_from_1)
movzbl (%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY
/* mov/movzbl leave flags intact, so CF still comes from the cmpl
above. */
movl %edx, %eax
adcq %rdi, %rax
movb %r8b, (%rdi, %rdx)
ret
# endif
L(copy_1):
# ifdef USE_AS_STPCPY
movl %edx, %eax
cmpl %ecx, %edx
adcq %rdi, %rax
# endif
# ifdef USE_AS_WCSCPY
vmovd %VMM_128(0), (%rdi)
# else
movb %r8b, (%rdi, %rdx)
# endif
ret
# endif
# ifndef USE_AS_WCSCPY
.p2align 4,, 8
L(zfill_from_1):
/* Byte build only. Reached from L(copy_0_3) when edx > ecx: a
single 2-byte zero store covers indices rdx - 1 and rdx. */
# ifdef USE_AS_STPCPY
/* Return the address of the terminator at index rcx. */
leaq (%rdi, %rcx), %rax
# endif
movw $0, -1(%rdi, %rdx)
ret
# endif
.p2align 4,, 4
L(zero_len):
/* Returns dst unchanged when incrementing rdx wraps to zero,
otherwise takes the overflow fallback. NOTE(review): presumably
rdx was decremented to len - 1 before reaching here, so the wrap
corresponds to len == 0 -- confirm against the entry code. */
incq %rdx
jne L(best_effort_strncpy)
movq %rdi, %rax
ret
# endif
.p2align 4,, 4
.p2align 6,, 8
L(page_cross):
/* Slow path for a source whose first unaligned VEC load is not
safe. Checks for a zero-CHAR using an aligned load from the VEC
containing rsi instead. */
movq %rsi, %rax
andq $(VEC_SIZE * -1), %rax
VPCMPEQ (%rax), %VZERO, %k0
KMOV %k0, %VRCX
/* Shift out the mask bits for CHARs that precede rsi within the
aligned VEC. */
# ifdef USE_AS_WCSCPY
/* Convert the byte offset within the VEC to a CHAR offset first. */
movl %esi, %r8d
shrl $2, %r8d
andl $(CHAR_PER_VEC - 1), %r8d
shrx %VR8, %VRCX, %VRCX
# else
/* NOTE(review): relies on shrx using only the low bits of the
count, so rsi can be passed without masking -- confirm for both
VEC sizes. */
shrx %VRSI, %VRCX, %VRCX
# endif
/* Compute amount of bytes we checked. */
subl %esi, %eax
andl $(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY
shrl $2, %eax
# endif
/* If rax > rdx then we are finishing the copy at the end of the
page. */
cmpq %rax, %rdx
jb L(page_cross_small)
/* If rcx is zero (no zero-CHAR in the checked part) then continue
on the normal path. */
test %VRCX, %VRCX
jz L(page_cross_continue)
/* We found zero-CHAR so need to copy then zfill (we know we
didn't cover all of length here). */
bsf %VRCX, %VRCX
L(movsb_and_zfill):
/* rcx = index of the zero-CHAR + 1 = number of CHARs to move,
including the terminator. */
incl %ecx
subq %rcx, %rdx
# ifdef USE_AS_STPCPY
/* Return the address of the terminator in dst. */
leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# else
movq %rdi, %rax
# endif
REP_MOVS
/* Store one zero CHAR at the post-copy rdi, then let the common
zfill code clear the rest. */
# ifdef USE_AS_WCSCPY
movl $0, (%rdi)
# else
movb $0, (%rdi)
# endif
jmp L(zfill_from_page_cross)
L(page_cross_small):
/* The requested length ends inside the checked region. */
tzcnt %VRCX, %VRCX
cmpl %ecx, %edx
jbe L(page_cross_copy_only)
/* Do a zfill of the tail before copying. */
/* Save dst in r9 and the zero-CHAR index in r8d because REP_STOS
uses rdi, rcx and rax. */
movq %rdi, %r9
xorl %eax, %eax
movl %ecx, %r8d
subl %ecx, %edx
leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
movl %edx, %ecx
REP_STOS
movq %r9, %rdi
movl %r8d, %edx
L(page_cross_copy_only):
/* Copy edx + 1 CHARs. lea and mov do not modify flags, so for the
stpcpy variants CF still comes from the `cmpl %ecx, %edx` above
when this label is reached through the `jbe`. */
leal 1(%rdx), %ecx
# ifdef USE_AS_STPCPY
/* NOTE(review): on the fall-through path CF depends on what the
REP_STOS expansion leaves in the flags; `rep stos` itself does
not modify them -- confirm against the macro definition. */
# ifdef USE_AS_WCSCPY
adcl $0, %edx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# else
movq %rdi, %rax
# endif
REP_MOVS
ret
L(best_effort_strncpy):
/* Fallback taken from L(zero_len): zero rdx CHARs starting at dst
with REP_STOS, then tail-call the unbounded copy routine with the
original dst restored. */
movq %rdx, %rcx
xorl %eax, %eax
/* Preserve dst; REP_STOS advances rdi. */
movq %rdi, %r8
/* The length is >= 2^63. We very much so expect to segfault at
rep stos. If that doesn't happen then just strcpy to finish.
*/
REP_STOS
movq %r8, %rdi
/* OVERFLOW_STRCPY names the matching non-length-limited copy
function; the `jmp` makes this a tail call, so that function
returns directly to our caller. */
jmp OVERFLOW_STRCPY
END(STRNCPY)
#endif

View File

@ -0,0 +1,80 @@
/* Helper for getting proper name of overflow fallback function for
{wc|st}{p|r|s}n{cat|cpy}
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1

/* Name-building helpers.  OF_NAMER lets its arguments be macro-expanded
   first; PRIMITIVE_OF_NAMER then pastes the four resulting pieces into
   a single identifier.  */
#define PRIMITIVE_OF_NAMER(lead, stem, tail, isa) lead##stem##tail##isa
#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

/* Leading underscores and ISA suffix.  Inside a multiarch libc build
   the fallback is one of the ISA-specific implementations; otherwise
   both pieces are empty and the plain function name is produced.  */
#if defined USE_MULTIARCH && IS_IN(libc)
# define UNDERSCORES __
# if defined USE_WITH_SSE2
#  define ISA_EXT _sse2
# elif defined USE_WITH_AVX2 && defined USE_WITH_RTM
#  define ISA_EXT _avx2_rtm
# elif defined USE_WITH_AVX2
#  define ISA_EXT _avx2
# elif defined USE_WITH_EVEX256
#  define ISA_EXT _evex
# elif defined USE_WITH_EVEX512
#  define ISA_EXT _evex512
# endif
#else
# define UNDERSCORES
# define ISA_EXT
#endif

/* Name stems: wide-character versus byte-string families.  */
#if defined USE_AS_WCSCPY
# define STRCPY_PREFIX wc
# define STRCAT_PREFIX wcs
#else
# define STRCPY_PREFIX st
# define STRCAT_PREFIX str
#endif

/* Copy-function tail.  The stp* form uses the same tail in both
   families; otherwise it completes "wc" + "scpy" or "st" + "rcpy".  */
#if defined USE_AS_STPCPY
# define STRCPY_POSTFIX pcpy
#elif defined USE_AS_WCSCPY
# define STRCPY_POSTFIX scpy
#else
# define STRCPY_POSTFIX rcpy
#endif
#define STRCAT_POSTFIX cat

/* Final fallback names.  Either may be pre-defined by the including
   file, in which case that definition is kept.  */
#if !defined OVERFLOW_STRCPY
# define OVERFLOW_STRCPY \
OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
#endif
#if !defined OVERFLOW_STRCAT
# define OVERFLOW_STRCAT \
OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
#endif

#endif