x86: Add avx2 optimized functions for the wchar_t strcpy family

Implemented:
    wcscat-avx2  (+ 744 bytes)
    wcscpy-avx2  (+ 539 bytes)
    wcpcpy-avx2  (+ 577 bytes)
    wcsncpy-avx2 (+1108 bytes)
    wcpncpy-avx2 (+1214 bytes)
    wcsncat-avx2 (+1085 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as geometric mean of all ratios of New Implementation / Best Old
    Implementation. Best Old Implementation was determined with the
    highest ISA implementation.

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.954

Code Size Changes:
    This change increases the size of libc.so by ~5.5kb. For
    reference the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.2kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
This commit is contained in:
Noah Goldstein 2022-11-08 17:38:41 -08:00
parent 64b8b6516b
commit 52cf11004e
27 changed files with 115 additions and 18 deletions

View File

@ -131,10 +131,13 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += \
wcpcpy-avx2 \
wcpcpy-evex \
wcpcpy-generic \
wcpncpy-avx2 \
wcpncpy-evex \
wcpncpy-generic \
wcscat-avx2 \
wcscat-evex \
wcscat-generic \
wcschr-avx2 \
@ -146,6 +149,7 @@ sysdep_routines += \
wcscmp-avx2-rtm \
wcscmp-evex \
wcscmp-sse2 \
wcscpy-avx2 \
wcscpy-evex \
wcscpy-generic \
wcscpy-ssse3 \
@ -155,11 +159,13 @@ sysdep_routines += \
wcslen-evex512 \
wcslen-sse2 \
wcslen-sse4_1 \
wcsncat-avx2 \
wcsncat-evex \
wcsncat-generic \
wcsncmp-avx2 \
wcsncmp-avx2-rtm \
wcsncmp-evex \
wcsncpy-avx2 \
wcsncpy-evex \
wcsncpy-generic \
wcsnlen-avx2 \

View File

@ -907,6 +907,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__wcscpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcscpy_avx2)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
CPU_FEATURE_USABLE (SSSE3),
__wcscpy_ssse3)
X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
@ -920,7 +924,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncpy_avx2)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
1,
__wcsncpy_generic))
@ -932,6 +940,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__wcpcpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcpcpy_avx2)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcpcpy,
1,
__wcpcpy_generic))
@ -942,7 +954,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcpncpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcpncpy_avx2)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
1,
__wcpncpy_generic))
@ -954,6 +970,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__wcscat_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcscat_avx2)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcscat,
1,
__wcscat_generic))
@ -965,6 +985,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncat_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncat_avx2)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncat,
1,
__wcsncat_generic))

View File

@ -27,6 +27,8 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
static inline void *
@ -42,6 +44,11 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
return OPTIMIZE (avx2);
}
return OPTIMIZE (GENERIC);

View File

@ -0,0 +1,8 @@
/* AVX2 wcpcpy: a thin wrapper that configures and then includes the
   shared strcpy-avx2.S source.  WCPCPY may be predefined by an including
   file to override the symbol name; otherwise it is __wcpcpy_avx2.  */
#ifndef WCPCPY
# define WCPCPY __wcpcpy_avx2
#endif
/* Configuration macros for the included file.  NOTE(review): presumably
   USE_AS_STPCPY selects the stpcpy-style return value and USE_AS_WCSCPY
   selects wchar_t elements -- confirm against strcpy-avx2.S.  */
#define USE_AS_STPCPY
#define USE_AS_WCSCPY
/* STRCPY is presumably the entry-point name used by strcpy-avx2.S; map it
   to this variant's symbol.  */
#define STRCPY WCPCPY
#include "strcpy-avx2.S"

View File

@ -19,7 +19,7 @@
/* We always need to build this implementation as strspn-sse4 needs to
be able to fallback to it. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
#if ISA_SHOULD_BUILD (2)
# define WCPCPY __wcpcpy_generic
# include <wcsmbs/wcpcpy.c>

View File

@ -0,0 +1,8 @@
/* AVX2 wcpncpy: a thin wrapper that configures and then includes the
   shared strncpy-avx2.S source.  WCPNCPY may be predefined by an including
   file to override the symbol name; otherwise it is __wcpncpy_avx2.  */
#ifndef WCPNCPY
# define WCPNCPY __wcpncpy_avx2
#endif
/* Configuration macros for the included file.  NOTE(review): presumably
   USE_AS_WCSCPY selects wchar_t elements and USE_AS_STPCPY selects the
   stpcpy-style return value -- confirm against strncpy-avx2.S.  */
#define USE_AS_WCSCPY
#define USE_AS_STPCPY
/* STRNCPY is presumably the entry-point name used by strncpy-avx2.S; map
   it to this variant's symbol.  */
#define STRNCPY WCPNCPY
#include "strncpy-avx2.S"

View File

@ -19,7 +19,7 @@
/* We always need to build this implementation as strspn-sse4 needs to
be able to fallback to it. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
#if ISA_SHOULD_BUILD (2)
# define WCPNCPY __wcpncpy_generic
# include <wcsmbs/wcpncpy.c>

View File

@ -0,0 +1,10 @@
/* AVX2 wcscat: a thin wrapper that configures and then includes the
   shared strcpy-avx2.S source.  WCSCAT may be predefined by an including
   file to override the symbol name; otherwise it is __wcscat_avx2.  */
#ifndef WCSCAT
# define WCSCAT __wcscat_avx2
#endif
/* Configuration macros for the included file.  NOTE(review): presumably
   USE_AS_WCSCPY selects wchar_t elements and USE_AS_STRCAT makes the copy
   start at the end of the destination -- confirm against strcpy-avx2.S.  */
#define USE_AS_WCSCPY
#define USE_AS_STRCAT
/* STRCPY is presumably the entry-point name used by strcpy-avx2.S; map it
   to this variant's symbol.  */
#define STRCPY WCSCAT
#include "strcpy-avx2.S"

View File

@ -19,7 +19,7 @@
/* We always need to build this implementation as strspn-sse4 needs to
be able to fallback to it. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
#if ISA_SHOULD_BUILD (2)
# define WCSCAT __wcscat_generic
# include <wcsmbs/wcscat.c>

View File

@ -0,0 +1,7 @@
/* AVX2 wcscpy: a thin wrapper that configures and then includes the
   shared strcpy-avx2.S source.  WCSCPY may be predefined by an including
   file to override the symbol name; otherwise it is __wcscpy_avx2.  */
#ifndef WCSCPY
# define WCSCPY __wcscpy_avx2
#endif
/* Configuration macro for the included file.  NOTE(review): presumably
   selects wchar_t elements -- confirm against strcpy-avx2.S.  */
#define USE_AS_WCSCPY
/* STRCPY is presumably the entry-point name used by strcpy-avx2.S; map it
   to this variant's symbol.  */
#define STRCPY WCSCPY
#include "strcpy-avx2.S"

View File

@ -18,7 +18,7 @@
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
#if ISA_SHOULD_BUILD (2)
# define WCSCPY __wcscpy_generic
# include <wcsmbs/wcscpy.c>

View File

@ -28,6 +28,8 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@ -44,6 +46,9 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
return OPTIMIZE (avx2);
}
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))

View File

@ -0,0 +1,9 @@
/* AVX2 wcsncat: a thin wrapper that configures and then includes the
   shared strncat-avx2.S source.  WCSNCAT may be predefined by an including
   file to override the symbol name; otherwise it is __wcsncat_avx2.  */
#ifndef WCSNCAT
# define WCSNCAT __wcsncat_avx2
#endif
/* Configuration macros for the included file.  NOTE(review): presumably
   USE_AS_WCSCPY selects wchar_t elements and USE_AS_STRCAT selects
   concatenation behavior -- confirm against strncat-avx2.S.  */
#define USE_AS_WCSCPY
#define USE_AS_STRCAT
/* STRNCAT is presumably the entry-point name used by strncat-avx2.S; map
   it to this variant's symbol.  */
#define STRNCAT WCSNCAT
#include "strncat-avx2.S"

View File

@ -19,7 +19,7 @@
/* We always need to build this implementation as strspn-sse4 needs to
be able to fallback to it. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
#if ISA_SHOULD_BUILD (2)
# define WCSNCAT __wcsncat_generic
# include <wcsmbs/wcsncat.c>

View File

@ -0,0 +1,7 @@
/* AVX2 wcsncpy: a thin wrapper that configures and then includes the
   shared strncpy-avx2.S source.  WCSNCPY may be predefined by an including
   file to override the symbol name; otherwise it is __wcsncpy_avx2.  */
#ifndef WCSNCPY
# define WCSNCPY __wcsncpy_avx2
#endif
/* Configuration macro for the included file.  NOTE(review): presumably
   selects wchar_t elements -- confirm against strncpy-avx2.S.  */
#define USE_AS_WCSCPY
/* STRNCPY is presumably the entry-point name used by strncpy-avx2.S; map
   it to this variant's symbol.  */
#define STRNCPY WCSNCPY
#include "strncpy-avx2.S"

View File

@ -19,7 +19,7 @@
/* We always need to build this implementation as strspn-sse4 needs to
be able to fallback to it. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
#if ISA_SHOULD_BUILD (2)
# define WCSNCPY __wcsncpy_generic
# include <wcsmbs/wcsncpy.c>

View File

@ -24,7 +24,7 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL <= 3
#if MINIMUM_X86_ISA_LEVEL <= 2
# include <wcsmbs/wcpcpy.c>

View File

@ -24,11 +24,12 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL >= 4
#if MINIMUM_X86_ISA_LEVEL >= 3
# define WCPCPY __wcpcpy
# define DEFAULT_IMPL_V4 "multiarch/wcpcpy-evex.S"
# define DEFAULT_IMPL_V3 "multiarch/wcpcpy-avx2.S"
/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
should never be used from here. */
# define DEFAULT_IMPL_V1 "ERROR -- Invalid ISA IMPL"

View File

@ -24,7 +24,7 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL <= 3
#if MINIMUM_X86_ISA_LEVEL <= 2
# include <wcsmbs/wcpncpy.c>

View File

@ -24,11 +24,12 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL >= 4
#if MINIMUM_X86_ISA_LEVEL >= 3
# define WCPNCPY __wcpncpy
# define DEFAULT_IMPL_V4 "multiarch/wcpncpy-evex.S"
# define DEFAULT_IMPL_V3 "multiarch/wcpncpy-avx2.S"
/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
should never be used from here. */
# define DEFAULT_IMPL_V1 "ERROR -- Invalid ISA IMPL"

View File

@ -24,7 +24,7 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL <= 3
#if MINIMUM_X86_ISA_LEVEL <= 2
# include <wcsmbs/wcscat.c>

View File

@ -24,11 +24,12 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL >= 4
#if MINIMUM_X86_ISA_LEVEL >= 3
# define WCSCAT __wcscat
# define DEFAULT_IMPL_V4 "multiarch/wcscat-evex.S"
# define DEFAULT_IMPL_V3 "multiarch/wcscat-avx2.S"
/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
should never be used from here. */
# define DEFAULT_IMPL_V1 "ERROR -- Invalid ISA IMPL"

View File

@ -29,6 +29,7 @@
# define WCSCPY __wcscpy
# define DEFAULT_IMPL_V4 "multiarch/wcscpy-evex.S"
# define DEFAULT_IMPL_V3 "multiarch/wcscpy-avx2.S"
# define DEFAULT_IMPL_V2 "multiarch/wcscpy-ssse3.S"
/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
should never be used from here. */

View File

@ -24,7 +24,7 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL <= 3
#if MINIMUM_X86_ISA_LEVEL <= 2
# include <wcsmbs/wcsncat.c>

View File

@ -24,11 +24,12 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL >= 4
#if MINIMUM_X86_ISA_LEVEL >= 3
# define WCSNCAT wcsncat
# define DEFAULT_IMPL_V4 "multiarch/wcsncat-evex.S"
# define DEFAULT_IMPL_V3 "multiarch/wcsncat-avx2.S"
/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
should never be used from here. */
# define DEFAULT_IMPL_V1 "ERROR -- Invalid ISA IMPL"

View File

@ -24,7 +24,7 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL <= 3
#if MINIMUM_X86_ISA_LEVEL <= 2
# include <wcsmbs/wcsncpy.c>

View File

@ -24,11 +24,12 @@
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL >= 4
#if MINIMUM_X86_ISA_LEVEL >= 3
# define WCSNCPY __wcsncpy
# define DEFAULT_IMPL_V4 "multiarch/wcsncpy-evex.S"
# define DEFAULT_IMPL_V3 "multiarch/wcsncpy-avx2.S"
/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
should never be used from here. */
# define DEFAULT_IMPL_V1 "ERROR -- Invalid ISA IMPL"