mirror of
git://sourceware.org/git/glibc.git
synced 2025-01-18 12:16:13 +08:00
x86_64: Add strstr function with 512-bit EVEX
Adding a 512-bit EVEX version of strstr. The algorithm works as follows: (1) We spend a few cycles at the begining to peek into the needle. We locate an edge in the needle (first occurance of 2 consequent distinct characters) and also store the first 64-bytes into a zmm register. (2) We search for the edge in the haystack by looking into one cache line of the haystack at a time. This avoids having to read past a page boundary which can cause a seg fault. (3) If an edge is found in the haystack we first compare the first 64-bytes of the needle (already stored in a zmm register) before we proceed with a full string compare performed byte by byte. Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) Geometric mean of all benchmarks: new / old = 0.66 Difficult skiptable(0) : new / old = 0.02 Difficult skiptable(1) : new / old = 0.01 Difficult 2-way : new / old = 0.25 Difficult testing first 2 : new / old = 1.26 Difficult skiptable(0) : new / old = 0.05 Difficult skiptable(1) : new / old = 0.06 Difficult 2-way : new / old = 0.26 Difficult testing first 2 : new / old = 1.05 Difficult skiptable(0) : new / old = 0.42 Difficult skiptable(1) : new / old = 0.24 Difficult 2-way : new / old = 0.21 Difficult testing first 2 : new / old = 1.04 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
8521001731
commit
5082a287d5
@ -116,6 +116,7 @@ sysdep_routines += \
|
||||
strrchr-sse2 \
|
||||
strspn-c \
|
||||
strspn-sse2 \
|
||||
strstr-avx512 \
|
||||
strstr-sse2-unaligned \
|
||||
varshift \
|
||||
# sysdep_routines
|
||||
@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
CFLAGS-strpbrk-c.c += -msse4
|
||||
CFLAGS-strspn-c.c += -msse4
|
||||
CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
|
@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/strstr.c. */
|
||||
IFUNC_IMPL (i, name, strstr,
|
||||
IFUNC_IMPL_ADD (array, i, strstr,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (AVX512DQ)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__strstr_avx512)
|
||||
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
|
||||
|
||||
|
214
sysdeps/x86_64/multiarch/strstr-avx512.c
Normal file
214
sysdeps/x86_64/multiarch/strstr-avx512.c
Normal file
@ -0,0 +1,214 @@
|
||||
/* strstr optimized with 512-bit AVX-512 instructions
|
||||
Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
#define FULL_MMASK64 0xffffffffffffffff
|
||||
#define ONE_64BIT 0x1ull
|
||||
#define ZMM_SIZE_IN_BYTES 64
|
||||
#define PAGESIZE 4096
|
||||
|
||||
/*
|
||||
Returns the index of the first edge within the needle, returns 0 if no edge
|
||||
is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
|
||||
*/
|
||||
static inline size_t
|
||||
find_edge_in_needle (const char *ned)
|
||||
{
|
||||
size_t ind = 0;
|
||||
while (ned[ind + 1] != '\0')
|
||||
{
|
||||
if (ned[ind] != ned[ind + 1])
|
||||
return ind;
|
||||
else
|
||||
ind = ind + 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Compare needle with haystack byte by byte at specified location
|
||||
*/
|
||||
static inline bool
|
||||
verify_string_match (const char *hay, const size_t hay_index, const char *ned,
|
||||
size_t ind)
|
||||
{
|
||||
while (ned[ind] != '\0')
|
||||
{
|
||||
if (ned[ind] != hay[hay_index + ind])
|
||||
return false;
|
||||
ind = ind + 1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
Compare needle with haystack at specified location. The first 64 bytes are
|
||||
compared using a ZMM register.
|
||||
*/
|
||||
static inline bool
|
||||
verify_string_match_avx512 (const char *hay, const size_t hay_index,
|
||||
const char *ned, const __mmask64 ned_mask,
|
||||
const __m512i ned_zmm)
|
||||
{
|
||||
/* check first 64 bytes using zmm and then scalar */
|
||||
__m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
|
||||
__mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
|
||||
if (match != 0x0) // failed the first few chars
|
||||
return false;
|
||||
else if (ned_mask == FULL_MMASK64)
|
||||
return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
|
||||
return true;
|
||||
}
|
||||
|
||||
char *
|
||||
__strstr_avx512 (const char *haystack, const char *ned)
|
||||
{
|
||||
char first = ned[0];
|
||||
if (first == '\0')
|
||||
return (char *)haystack;
|
||||
if (ned[1] == '\0')
|
||||
return (char *)strchr (haystack, ned[0]);
|
||||
|
||||
size_t edge = find_edge_in_needle (ned);
|
||||
|
||||
/* ensure haystack is as long as the pos of edge in needle */
|
||||
for (int ii = 0; ii < edge; ++ii)
|
||||
{
|
||||
if (haystack[ii] == '\0')
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
Load 64 bytes of the needle and save it to a zmm register
|
||||
Read one cache line at a time to avoid loading across a page boundary
|
||||
*/
|
||||
__mmask64 ned_load_mask = _bzhi_u64 (
|
||||
FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
|
||||
__m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
|
||||
__mmask64 ned_nullmask
|
||||
= _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
|
||||
|
||||
if (__glibc_unlikely (ned_nullmask == 0x0))
|
||||
{
|
||||
ned_zmm = _mm512_loadu_si512 (ned);
|
||||
ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
|
||||
ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
|
||||
if (ned_nullmask != 0x0)
|
||||
ned_load_mask = ned_load_mask >> 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
|
||||
ned_load_mask = ned_load_mask >> 1;
|
||||
}
|
||||
const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
|
||||
const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
|
||||
|
||||
/*
|
||||
Read the bytes of haystack in the current cache line
|
||||
*/
|
||||
size_t hay_index = edge;
|
||||
__mmask64 loadmask = _bzhi_u64 (
|
||||
FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
|
||||
/* First load is a partial cache line */
|
||||
__m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
|
||||
/* Search for NULL and compare only till null char */
|
||||
uint64_t nullmask
|
||||
= _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
|
||||
uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
|
||||
cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
|
||||
/* Search for the 2 charaters of needle */
|
||||
__mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
|
||||
__mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
|
||||
k1 = _kshiftri_mask64 (k1, 1);
|
||||
/* k2 masks tell us if both chars from needle match */
|
||||
uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
|
||||
/* For every match, search for the entire needle for a full match */
|
||||
while (k2)
|
||||
{
|
||||
uint64_t bitcount = _tzcnt_u64 (k2);
|
||||
k2 = _blsr_u64 (k2);
|
||||
size_t match_pos = hay_index + bitcount - edge;
|
||||
if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
|
||||
< PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
|
||||
{
|
||||
/*
|
||||
* Use vector compare as long as you are not crossing a page
|
||||
*/
|
||||
if (verify_string_match_avx512 (haystack, match_pos, ned,
|
||||
ned_load_mask, ned_zmm))
|
||||
return (char *)haystack + match_pos;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (verify_string_match (haystack, match_pos, ned, 0))
|
||||
return (char *)haystack + match_pos;
|
||||
}
|
||||
}
|
||||
/* We haven't checked for potential match at the last char yet */
|
||||
haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
|
||||
hay_index = 0;
|
||||
|
||||
/*
|
||||
Loop over one cache line at a time to prevent reading over page
|
||||
boundary
|
||||
*/
|
||||
__m512i hay1;
|
||||
while (nullmask == 0)
|
||||
{
|
||||
hay0 = _mm512_loadu_si512 (haystack + hay_index);
|
||||
hay1 = _mm512_load_si512 (haystack + hay_index
|
||||
+ 1); // Always 64 byte aligned
|
||||
nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
|
||||
/* Compare only till null char */
|
||||
cmpmask = nullmask ^ (nullmask - ONE_64BIT);
|
||||
k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
|
||||
k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
|
||||
/* k2 masks tell us if both chars from needle match */
|
||||
k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
|
||||
/* For every match, compare full strings for potential match */
|
||||
while (k2)
|
||||
{
|
||||
uint64_t bitcount = _tzcnt_u64 (k2);
|
||||
k2 = _blsr_u64 (k2);
|
||||
size_t match_pos = hay_index + bitcount - edge;
|
||||
if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
|
||||
< PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
|
||||
{
|
||||
/*
|
||||
* Use vector compare as long as you are not crossing a page
|
||||
*/
|
||||
if (verify_string_match_avx512 (haystack, match_pos, ned,
|
||||
ned_load_mask, ned_zmm))
|
||||
return (char *)haystack + match_pos;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Compare byte by byte */
|
||||
if (verify_string_match (haystack, match_pos, ned, 0))
|
||||
return (char *)haystack + match_pos;
|
||||
}
|
||||
}
|
||||
hay_index += ZMM_SIZE_IN_BYTES;
|
||||
}
|
||||
return NULL;
|
||||
}
|
@ -35,16 +35,32 @@
|
||||
|
||||
extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
|
||||
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
|
||||
extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
|
||||
|
||||
#include "init-arch.h"
|
||||
|
||||
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
|
||||
ifunc symbol properly. */
|
||||
extern __typeof (__redirect_strstr) __libc_strstr;
|
||||
libc_ifunc (__libc_strstr,
|
||||
HAS_ARCH_FEATURE (Fast_Unaligned_Load)
|
||||
? __strstr_sse2_unaligned
|
||||
: __strstr_sse2)
|
||||
|
||||
static inline void *
|
||||
IFUNC_SELECTOR (void)
|
||||
{
|
||||
const struct cpu_features *cpu_features = __get_cpu_features ();
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
return __strstr_avx512;
|
||||
|
||||
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
|
||||
return __strstr_sse2_unaligned;
|
||||
|
||||
return __strstr_sse2;
|
||||
}
|
||||
|
||||
libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
|
||||
#undef strstr
|
||||
strong_alias (__libc_strstr, strstr)
|
||||
|
Loading…
Reference in New Issue
Block a user