Speed up SSE4.2 strcasestr by avoiding indirect function call.

This commit is contained in:
Ulrich Drepper 2010-07-16 15:37:38 -07:00
parent 9b059f9774
commit cc9f2e47a0
5 changed files with 86 additions and 49 deletions

View File

@ -1,3 +1,13 @@
2010-07-16 Ulrich Drepper <drepper@redhat.com>
* sysdeps/x86_64/multiarch/strstr.c: Rewrite to avoid indirect function
call in strcasestr.
* sysdeps/x86_64/multiarch/strcasestr.c: Declare
__strcasestr_sse42_nonascii.
* sysdeps/x86_64/multiarch/Makefile: Add rules to build
strcasestr-nonascii.c.
* sysdeps/x86_64/multiarch/strcasestr-nonascii.c: New file.
2010-06-15 Luis Machado <luisgpm@br.ibm.com> 2010-06-15 Luis Machado <luisgpm@br.ibm.com>
* sysdeps/powerpc/powerpc32/power6/fpu/s_copysign.S: New file. * sysdeps/powerpc/powerpc32/power6/fpu/s_copysign.S: New file.

View File

@ -7,7 +7,7 @@ ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back memmove-ssse3-back strcasestr-nonascii
ifeq (yes,$(config-cflags-sse4)) ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4 CFLAGS-strcspn-c.c += -msse4
@ -15,5 +15,6 @@ CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4 CFLAGS-strspn-c.c += -msse4
CFLAGS-strstr.c += -msse4 CFLAGS-strstr.c += -msse4
CFLAGS-strcasestr.c += -msse4 CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
endif endif
endif endif

View File

@ -0,0 +1,50 @@
/* strcasestr with SSE4.2 intrinsics, variant for non-ASCII case conversion
Copyright (C) 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
# include <ctype.h>
/* Similar to __m128i_strloadu.  Convert to lower case for non-POSIX/C
   locales by running every byte through tolower.

   At most 16 bytes are taken from P, and P is never read beyond its
   terminating NUL.  In the returned vector the NUL and every byte
   after it are zero.  */
static inline __m128i
__m128i_strloadu_tolower (const unsigned char * p)
{
  /* Start from an all-zero vector.  The terminator and the padding
     behind it are then already in place, and a string shorter than
     16 bytes no longer leaves indeterminate bytes in the result.  */
  union
    {
      char b[16];
      __m128i x;
    } u = { { 0 } };

  /* Stop at the first NUL so nothing past the end of the string is
     touched.  */
  for (int i = 0; i < 16 && p[i] != 0; ++i)
    u.b[i] = tolower (p[i]);

  return u.x;
}
/* Tell strstr.c not to define its own POSIX/C-locale
   __m128i_strloadu_tolower, so the tolower-based version defined in
   this file is the one used.  strstr.c also leaves out its leading
   empty-string and one-byte-string checks when this is defined.  */
#define STRCASESTR_NONASCII
/* Select the case-insensitive (strcasestr) flavor of the shared code.  */
#define USE_AS_STRCASESTR
/* Name, with hidden visibility, of the function that strstr.c defines.  */
#define STRSTR_SSE42 attribute_hidden __strcasestr_sse42_nonascii
#include "strstr.c"

View File

@ -1,3 +1,7 @@
/* strcasestr variant defined via strcasestr-nonascii.c.  The SSE4.2
   strcasestr code in strstr.c hands S1 and S2 to it when
   _NL_CTYPE_NONASCII_CASE is nonzero for the current LC_CTYPE locale.  */
extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
const unsigned char *s2)
attribute_hidden;
#define USE_AS_STRCASESTR #define USE_AS_STRCASESTR
#define STRSTR_SSE42 __strcasestr_sse42 #define STRSTR_SSE42 __strcasestr_sse42
#include "strstr.c" #include "strstr.c"

View File

@ -1,5 +1,5 @@
/* strstr with SSE4.2 intrinsics /* strstr with SSE4.2 intrinsics
Copyright (C) 2009 Free Software Foundation, Inc. Copyright (C) 2009, 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation. Contributed by Intel Corporation.
This file is part of the GNU C Library. This file is part of the GNU C Library.
@ -67,10 +67,10 @@
case ECX CFlag ZFlag SFlag case ECX CFlag ZFlag SFlag
3 X 1 0 0/1 3 X 1 0 0/1
4a 0 1 0 0 4a 0 1 0 0
4b 0 1 0 1 4b 0 1 0 1
4c 0 < X 1 0 0/1 4c 0 < X 1 0 0/1
5 16 0 1 0 5 16 0 1 0
3. An initial ordered-comparison fragment match, we fix up to do 3. An initial ordered-comparison fragment match, we fix up to do
subsequent string comparison subsequent string comparison
@ -147,8 +147,7 @@ __m128i_shift_right (__m128i value, int offset)
If EOS occurs within less than 16B before 4KB boundary, we don't If EOS occurs within less than 16B before 4KB boundary, we don't
cross to next page. */ cross to next page. */
static __m128i static inline __m128i
__attribute__ ((section (".text.sse4.2")))
__m128i_strloadu (const unsigned char * p) __m128i_strloadu (const unsigned char * p)
{ {
int offset = ((size_t) p & (16 - 1)); int offset = ((size_t) p & (16 - 1));
@ -164,14 +163,12 @@ __m128i_strloadu (const unsigned char * p)
return _mm_loadu_si128 ((__m128i *) p); return _mm_loadu_si128 ((__m128i *) p);
} }
#ifdef USE_AS_STRCASESTR #if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
locale. */ locale. */
static inline __m128i
static __m128i __m128i_strloadu_tolower (const unsigned char * p)
__attribute__ ((section (".text.sse4.2")))
__m128i_strloadu_tolower_posix (const unsigned char * p)
{ {
__m128i frag = __m128i_strloadu (p); __m128i frag = __m128i_strloadu (p);
@ -184,39 +181,13 @@ __m128i_strloadu_tolower_posix (const unsigned char * p)
return _mm_blendv_epi8 (frag, mask2, mask1); return _mm_blendv_epi8 (frag, mask2, mask1);
} }
/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
locale. */
static __m128i
__attribute__ ((section (".text.sse4.2")))
__m128i_strloadu_tolower (const unsigned char * p)
{
union
{
char b[16];
__m128i x;
} u;
for (int i = 0; i < 16; i++)
if (p[i] == 0)
{
u.b[i] = 0;
break;
}
else
u.b[i] = tolower (p[i]);
return u.x;
}
#endif #endif
/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
algorithm) overlap for a fully populated 16B vector. algorithm) overlap for a fully populated 16B vector.
Input parameter: 1st 16Byte loaded from the reference string of a Input parameter: 1st 16Byte loaded from the reference string of a
strstr function. strstr function.
We don't use KMP algorithm if reference string is less than 16B. We don't use KMP algorithm if reference string is less than 16B. */
*/
static int static int
__inline__ __attribute__ ((__always_inline__,)) __inline__ __attribute__ ((__always_inline__,))
KMP16Bovrlap (__m128i s2) KMP16Bovrlap (__m128i s2)
@ -236,7 +207,7 @@ KMP16Bovrlap (__m128i s2)
return 1; return 1;
else if (!k1) else if (!k1)
{ {
/* There are al least two ditinct char in s2. If byte 0 and 1 are /* There are al least two distinct chars in s2. If byte 0 and 1 are
idential and the distinct value lies farther down, we can deduce idential and the distinct value lies farther down, we can deduce
the next byte offset to restart full compare is least no earlier the next byte offset to restart full compare is least no earlier
than byte 3. */ than byte 3. */
@ -256,23 +227,24 @@ STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
#define p1 s1 #define p1 s1
const unsigned char *p2 = s2; const unsigned char *p2 = s2;
if (p2[0] == '\0') #ifndef STRCASESTR_NONASCII
if (__builtin_expect (p2[0] == '\0', 0))
return (char *) p1; return (char *) p1;
if (p1[0] == '\0') if (__builtin_expect (p1[0] == '\0', 0))
return NULL; return NULL;
/* Check if p1 length is 1 byte long. */ /* Check if p1 length is 1 byte long. */
if (p1[1] == '\0') if (__builtin_expect (p1[1] == '\0', 0))
return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL; return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
#endif
#ifdef USE_AS_STRCASESTR #ifdef USE_AS_STRCASESTR
__m128i (*strloadu) (const unsigned char *); if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
!= 0, 0))
return __strcasestr_sse42_nonascii (s1, s2);
if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0) # define strloadu __m128i_strloadu_tolower
strloadu = __m128i_strloadu_tolower_posix;
else
strloadu = __m128i_strloadu_tolower;
#else #else
# define strloadu __m128i_strloadu # define strloadu __m128i_strloadu
#endif #endif