mirror of
git://sourceware.org/git/glibc.git
synced 2024-11-27 03:41:23 +08:00
Clean up SSE variable shifts
This commit is contained in:
parent
84b9230c40
commit
73f27d5e72
17
ChangeLog
17
ChangeLog
@ -1,3 +1,20 @@
|
||||
2010-08-24 Richard Henderson <rth@redhat.com>
|
||||
Ulrich Drepper <drepper@redhat.com>
|
||||
H.J. Lu <hongjiu.lu@intel.com>
|
||||
|
||||
* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add varshift.
|
||||
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Likewise.
|
||||
* sysdeps/x86_64/multiarch/strcspn-c.c: Include "varshift.h".
|
||||
Replace _mm_srli_si128 with __m128i_shift_right. Replace
|
||||
_mm_alignr_epi8 with _mm_loadu_si128.
|
||||
* sysdeps/x86_64/multiarch/strspn-c.c: Likewise.
|
||||
* sysdeps/x86_64/multiarch/strstr.c: Include "varshift.h".
|
||||
(__m128i_shift_right): Removed.
|
||||
* sysdeps/i386/i686/multiarch/varshift.h: New file.
|
||||
* sysdeps/i386/i686/multiarch/varshift.S: New file.
|
||||
* sysdeps/x86_64/multiarch/varshift.h: New file.
|
||||
* sysdeps/x86_64/multiarch/varshift.S: New file.
|
||||
|
||||
2010-08-21 Mike Frysinger <vapier@gentoo.org>
|
||||
|
||||
* configure.in: Move assembler checks to before sysdep dir checking.
|
||||
|
@ -9,7 +9,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
|
||||
memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
|
||||
memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
|
||||
strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
|
||||
memcmp-ssse3 memcmp-sse4 strcasestr-nonascii
|
||||
memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift
|
||||
ifeq (yes,$(config-cflags-sse4))
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
|
1
sysdeps/i386/i686/multiarch/varshift.S
Normal file
1
sysdeps/i386/i686/multiarch/varshift.S
Normal file
@ -0,0 +1 @@
|
||||
#include <sysdeps/x86_64/multiarch/varshift.S>
|
1
sysdeps/i386/i686/multiarch/varshift.h
Normal file
1
sysdeps/i386/i686/multiarch/varshift.h
Normal file
@ -0,0 +1 @@
|
||||
#include <sysdeps/x86_64/multiarch/varshift.h>
|
@ -10,7 +10,7 @@ sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
|
||||
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
|
||||
strncase_l-ssse3
|
||||
ifeq (yes,$(config-cflags-sse4))
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
CFLAGS-strpbrk-c.c += -msse4
|
||||
CFLAGS-strspn-c.c += -msse4
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include <nmmintrin.h>
|
||||
#include <string.h>
|
||||
#include "varshift.h"
|
||||
|
||||
/* We use 0x2:
|
||||
_SIDD_SBYTE_OPS
|
||||
@ -86,8 +87,6 @@ STRCSPN_SSE42 (const char *s, const char *a)
|
||||
|
||||
const char *aligned;
|
||||
__m128i mask;
|
||||
/* Fake initialization. gcc otherwise will warn. */
|
||||
asm ("" : "=xm" (mask));
|
||||
int offset = (int) ((size_t) a & 15);
|
||||
if (offset != 0)
|
||||
{
|
||||
@ -95,54 +94,7 @@ STRCSPN_SSE42 (const char *s, const char *a)
|
||||
aligned = (const char *) ((size_t) a & -16L);
|
||||
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
|
||||
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
mask = _mm_srli_si128 (mask0, 1);
|
||||
break;
|
||||
case 2:
|
||||
mask = _mm_srli_si128 (mask0, 2);
|
||||
break;
|
||||
case 3:
|
||||
mask = _mm_srli_si128 (mask0, 3);
|
||||
break;
|
||||
case 4:
|
||||
mask = _mm_srli_si128 (mask0, 4);
|
||||
break;
|
||||
case 5:
|
||||
mask = _mm_srli_si128 (mask0, 5);
|
||||
break;
|
||||
case 6:
|
||||
mask = _mm_srli_si128 (mask0, 6);
|
||||
break;
|
||||
case 7:
|
||||
mask = _mm_srli_si128 (mask0, 7);
|
||||
break;
|
||||
case 8:
|
||||
mask = _mm_srli_si128 (mask0, 8);
|
||||
break;
|
||||
case 9:
|
||||
mask = _mm_srli_si128 (mask0, 9);
|
||||
break;
|
||||
case 10:
|
||||
mask = _mm_srli_si128 (mask0, 10);
|
||||
break;
|
||||
case 11:
|
||||
mask = _mm_srli_si128 (mask0, 11);
|
||||
break;
|
||||
case 12:
|
||||
mask = _mm_srli_si128 (mask0, 12);
|
||||
break;
|
||||
case 13:
|
||||
mask = _mm_srli_si128 (mask0, 13);
|
||||
break;
|
||||
case 14:
|
||||
mask = _mm_srli_si128 (mask0, 14);
|
||||
break;
|
||||
case 15:
|
||||
mask = _mm_srli_si128 (mask0, 15);
|
||||
break;
|
||||
}
|
||||
mask = __m128i_shift_right (mask0, offset);
|
||||
|
||||
/* Find where the NULL terminator is. */
|
||||
int length = _mm_cmpistri (mask, mask, 0x3a);
|
||||
@ -159,55 +111,10 @@ STRCSPN_SSE42 (const char *s, const char *a)
|
||||
|
||||
if (index != 0)
|
||||
{
|
||||
/* Combine mask0 and mask1. */
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 1);
|
||||
break;
|
||||
case 2:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 2);
|
||||
break;
|
||||
case 3:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 3);
|
||||
break;
|
||||
case 4:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 4);
|
||||
break;
|
||||
case 5:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 5);
|
||||
break;
|
||||
case 6:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 6);
|
||||
break;
|
||||
case 7:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 7);
|
||||
break;
|
||||
case 8:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 8);
|
||||
break;
|
||||
case 9:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 9);
|
||||
break;
|
||||
case 10:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 10);
|
||||
break;
|
||||
case 11:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 11);
|
||||
break;
|
||||
case 12:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 12);
|
||||
break;
|
||||
case 13:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 13);
|
||||
break;
|
||||
case 14:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 14);
|
||||
break;
|
||||
case 15:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 15);
|
||||
break;
|
||||
}
|
||||
/* Combine mask0 and mask1. We could play games with
|
||||
palignr, but frankly this data should be in L1 now
|
||||
so do the merge via an unaligned load. */
|
||||
mask = _mm_loadu_si128 ((__m128i *) a);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -234,54 +141,7 @@ STRCSPN_SSE42 (const char *s, const char *a)
|
||||
aligned = (const char *) ((size_t) s & -16L);
|
||||
__m128i value = _mm_load_si128 ((__m128i *) aligned);
|
||||
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
value = _mm_srli_si128 (value, 1);
|
||||
break;
|
||||
case 2:
|
||||
value = _mm_srli_si128 (value, 2);
|
||||
break;
|
||||
case 3:
|
||||
value = _mm_srli_si128 (value, 3);
|
||||
break;
|
||||
case 4:
|
||||
value = _mm_srli_si128 (value, 4);
|
||||
break;
|
||||
case 5:
|
||||
value = _mm_srli_si128 (value, 5);
|
||||
break;
|
||||
case 6:
|
||||
value = _mm_srli_si128 (value, 6);
|
||||
break;
|
||||
case 7:
|
||||
value = _mm_srli_si128 (value, 7);
|
||||
break;
|
||||
case 8:
|
||||
value = _mm_srli_si128 (value, 8);
|
||||
break;
|
||||
case 9:
|
||||
value = _mm_srli_si128 (value, 9);
|
||||
break;
|
||||
case 10:
|
||||
value = _mm_srli_si128 (value, 10);
|
||||
break;
|
||||
case 11:
|
||||
value = _mm_srli_si128 (value, 11);
|
||||
break;
|
||||
case 12:
|
||||
value = _mm_srli_si128 (value, 12);
|
||||
break;
|
||||
case 13:
|
||||
value = _mm_srli_si128 (value, 13);
|
||||
break;
|
||||
case 14:
|
||||
value = _mm_srli_si128 (value, 14);
|
||||
break;
|
||||
case 15:
|
||||
value = _mm_srli_si128 (value, 15);
|
||||
break;
|
||||
}
|
||||
value = __m128i_shift_right (value, offset);
|
||||
|
||||
int length = _mm_cmpistri (mask, value, 0x2);
|
||||
/* No need to check ZFlag since ZFlag is always 1. */
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include <nmmintrin.h>
|
||||
#include <string.h>
|
||||
#include "varshift.h"
|
||||
|
||||
/* We use 0x12:
|
||||
_SIDD_SBYTE_OPS
|
||||
@ -71,54 +72,7 @@ __strspn_sse42 (const char *s, const char *a)
|
||||
aligned = (const char *) ((size_t) a & -16L);
|
||||
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
|
||||
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
mask = _mm_srli_si128 (mask0, 1);
|
||||
break;
|
||||
case 2:
|
||||
mask = _mm_srli_si128 (mask0, 2);
|
||||
break;
|
||||
case 3:
|
||||
mask = _mm_srli_si128 (mask0, 3);
|
||||
break;
|
||||
case 4:
|
||||
mask = _mm_srli_si128 (mask0, 4);
|
||||
break;
|
||||
case 5:
|
||||
mask = _mm_srli_si128 (mask0, 5);
|
||||
break;
|
||||
case 6:
|
||||
mask = _mm_srli_si128 (mask0, 6);
|
||||
break;
|
||||
case 7:
|
||||
mask = _mm_srli_si128 (mask0, 7);
|
||||
break;
|
||||
case 8:
|
||||
mask = _mm_srli_si128 (mask0, 8);
|
||||
break;
|
||||
case 9:
|
||||
mask = _mm_srli_si128 (mask0, 9);
|
||||
break;
|
||||
case 10:
|
||||
mask = _mm_srli_si128 (mask0, 10);
|
||||
break;
|
||||
case 11:
|
||||
mask = _mm_srli_si128 (mask0, 11);
|
||||
break;
|
||||
case 12:
|
||||
mask = _mm_srli_si128 (mask0, 12);
|
||||
break;
|
||||
case 13:
|
||||
mask = _mm_srli_si128 (mask0, 13);
|
||||
break;
|
||||
case 14:
|
||||
mask = _mm_srli_si128 (mask0, 14);
|
||||
break;
|
||||
case 15:
|
||||
mask = _mm_srli_si128 (mask0, 15);
|
||||
break;
|
||||
}
|
||||
mask = __m128i_shift_right (mask0, offset);
|
||||
|
||||
/* Find where the NULL terminator is. */
|
||||
int length = _mm_cmpistri (mask, mask, 0x3a);
|
||||
@ -135,55 +89,10 @@ __strspn_sse42 (const char *s, const char *a)
|
||||
|
||||
if (index != 0)
|
||||
{
|
||||
/* Combine mask0 and mask1. */
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 1);
|
||||
break;
|
||||
case 2:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 2);
|
||||
break;
|
||||
case 3:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 3);
|
||||
break;
|
||||
case 4:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 4);
|
||||
break;
|
||||
case 5:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 5);
|
||||
break;
|
||||
case 6:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 6);
|
||||
break;
|
||||
case 7:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 7);
|
||||
break;
|
||||
case 8:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 8);
|
||||
break;
|
||||
case 9:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 9);
|
||||
break;
|
||||
case 10:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 10);
|
||||
break;
|
||||
case 11:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 11);
|
||||
break;
|
||||
case 12:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 12);
|
||||
break;
|
||||
case 13:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 13);
|
||||
break;
|
||||
case 14:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 14);
|
||||
break;
|
||||
case 15:
|
||||
mask = _mm_alignr_epi8 (mask1, mask0, 15);
|
||||
break;
|
||||
}
|
||||
/* Combine mask0 and mask1. We could play games with
|
||||
palignr, but frankly this data should be in L1 now
|
||||
so do the merge via an unaligned load. */
|
||||
mask = _mm_loadu_si128 ((__m128i *) a);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -210,54 +119,7 @@ __strspn_sse42 (const char *s, const char *a)
|
||||
aligned = (const char *) ((size_t) s & -16L);
|
||||
__m128i value = _mm_load_si128 ((__m128i *) aligned);
|
||||
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
value = _mm_srli_si128 (value, 1);
|
||||
break;
|
||||
case 2:
|
||||
value = _mm_srli_si128 (value, 2);
|
||||
break;
|
||||
case 3:
|
||||
value = _mm_srli_si128 (value, 3);
|
||||
break;
|
||||
case 4:
|
||||
value = _mm_srli_si128 (value, 4);
|
||||
break;
|
||||
case 5:
|
||||
value = _mm_srli_si128 (value, 5);
|
||||
break;
|
||||
case 6:
|
||||
value = _mm_srli_si128 (value, 6);
|
||||
break;
|
||||
case 7:
|
||||
value = _mm_srli_si128 (value, 7);
|
||||
break;
|
||||
case 8:
|
||||
value = _mm_srli_si128 (value, 8);
|
||||
break;
|
||||
case 9:
|
||||
value = _mm_srli_si128 (value, 9);
|
||||
break;
|
||||
case 10:
|
||||
value = _mm_srli_si128 (value, 10);
|
||||
break;
|
||||
case 11:
|
||||
value = _mm_srli_si128 (value, 11);
|
||||
break;
|
||||
case 12:
|
||||
value = _mm_srli_si128 (value, 12);
|
||||
break;
|
||||
case 13:
|
||||
value = _mm_srli_si128 (value, 13);
|
||||
break;
|
||||
case 14:
|
||||
value = _mm_srli_si128 (value, 14);
|
||||
break;
|
||||
case 15:
|
||||
value = _mm_srli_si128 (value, 15);
|
||||
break;
|
||||
}
|
||||
value = __m128i_shift_right (value, offset);
|
||||
|
||||
int length = _mm_cmpistri (mask, value, 0x12);
|
||||
/* No need to check CFlag since it is always 1. */
|
||||
|
@ -19,6 +19,7 @@
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <nmmintrin.h>
|
||||
#include "varshift.h"
|
||||
|
||||
#ifndef STRSTR_SSE42
|
||||
# define STRSTR_SSE42 __strstr_sse42
|
||||
@ -82,67 +83,6 @@
|
||||
5. failed string compare, go back to scanning
|
||||
*/
|
||||
|
||||
/* Fix-up of removal of unneeded data due to 16B aligned load
|
||||
parameters:
|
||||
value: 16B data loaded from 16B aligned address.
|
||||
offset: Offset of target data address relative to 16B aligned load
|
||||
address.
|
||||
*/
|
||||
|
||||
static __inline__ __m128i
|
||||
__m128i_shift_right (__m128i value, int offset)
|
||||
{
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
value = _mm_srli_si128 (value, 1);
|
||||
break;
|
||||
case 2:
|
||||
value = _mm_srli_si128 (value, 2);
|
||||
break;
|
||||
case 3:
|
||||
value = _mm_srli_si128 (value, 3);
|
||||
break;
|
||||
case 4:
|
||||
value = _mm_srli_si128 (value, 4);
|
||||
break;
|
||||
case 5:
|
||||
value = _mm_srli_si128 (value, 5);
|
||||
break;
|
||||
case 6:
|
||||
value = _mm_srli_si128 (value, 6);
|
||||
break;
|
||||
case 7:
|
||||
value = _mm_srli_si128 (value, 7);
|
||||
break;
|
||||
case 8:
|
||||
value = _mm_srli_si128 (value, 8);
|
||||
break;
|
||||
case 9:
|
||||
value = _mm_srli_si128 (value, 9);
|
||||
break;
|
||||
case 10:
|
||||
value = _mm_srli_si128 (value, 10);
|
||||
break;
|
||||
case 11:
|
||||
value = _mm_srli_si128 (value, 11);
|
||||
break;
|
||||
case 12:
|
||||
value = _mm_srli_si128 (value, 12);
|
||||
break;
|
||||
case 13:
|
||||
value = _mm_srli_si128 (value, 13);
|
||||
break;
|
||||
case 14:
|
||||
value = _mm_srli_si128 (value, 14);
|
||||
break;
|
||||
case 15:
|
||||
value = _mm_srli_si128 (value, 15);
|
||||
break;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/* Simple replacement of movdqu to address 4KB boundary cross issue.
|
||||
If EOS occurs within less than 16B before 4KB boundary, we don't
|
||||
cross to next page. */
|
||||
|
30
sysdeps/x86_64/multiarch/varshift.S
Normal file
30
sysdeps/x86_64/multiarch/varshift.S
Normal file
@ -0,0 +1,30 @@
|
||||
/* Helper for variable shifts of SSE registers.
|
||||
Copyright (C) 2010 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
|
||||
/* Shuffle-control table used by __m128i_shift_right in varshift.h.

   varshift.h does an unaligned 16-byte load starting at
   ___m128i_shift_right + OFFSET and passes the result to
   _mm_shuffle_epi8 as the control vector.  For OFFSET = k that window
   holds the indices k, k+1, ..., 15 followed by k bytes of -1, so
   source bytes k..15 are selected into the low result bytes and the
   remaining control bytes are -1 (high bit set, which PSHUFB is
   documented to turn into zero bytes).

   The table is 16 + 15 = 31 bytes long, which is exactly enough for a
   16-byte load at every OFFSET in 0..15; the C declaration in
   varshift.h uses the same size.  The symbol is global so the C
   sources can reference it, and hidden so it is not exported.  */
.section .rodata
|
||||
.hidden ___m128i_shift_right
|
||||
.globl ___m128i_shift_right
|
||||
.size ___m128i_shift_right, 31
|
||||
|
||||
___m128i_shift_right:
|
||||
/* Identity byte indices 0..15.  */
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
/* Fifteen bytes of -1 that follow the identity indices.  */
.byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
|
27
sysdeps/x86_64/multiarch/varshift.h
Normal file
27
sysdeps/x86_64/multiarch/varshift.h
Normal file
@ -0,0 +1,27 @@
|
||||
/* Helper for variable shifts of SSE registers.
|
||||
Copyright (C) 2010 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
|
||||
extern char ___m128i_shift_right[31] __attribute__((visibility("hidden")));
|
||||
|
||||
static __inline__ __m128i
|
||||
__m128i_shift_right (__m128i value, unsigned long offset)
|
||||
{
|
||||
return _mm_shuffle_epi8 (value, _mm_loadu_si128 ((__m128 *) (___m128i_shift_right + offset)));
|
||||
}
|
Loading…
Reference in New Issue
Block a user