mirror of git://sourceware.org/git/glibc.git (synced 2025-01-30 12:31:53 +08:00)
Faster strchr implementation.
commit 5905e7b3e2
parent b987c77672

ChangeLog (11 changed lines)
@@ -1,3 +1,12 @@
2013-09-11  Ondřej Bílka  <neleai@seznam.cz>

	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Remove: __strchr_sse42.
	* sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
	(strchr): Remove __strchr_sse42 ifunc selection.
	* sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
	* sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.

2013-09-11  Will Newton  <will.newton@linaro.org>

	* benchtests/bench-timing.h (TIMING_INIT): Rename ITERS
@@ -35,7 +44,7 @@
	* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
	does not overflow.

2013-09-10  Ondřej Bílka  <neleai@seznam.cz>
2013-09-10  Ondřej Bílka  <neleai@seznam.cz>

	* sysdeps/ieee754/dbl-64/e_j0.c: Remove DO_NOT_USE_THIS conditionals.
	* sysdeps/ieee754/dbl-64/e_j1.c: Likewise.
sysdeps/x86_64/multiarch/ifunc-impl-list.c

@@ -110,7 +110,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,

  /* Support sysdeps/x86_64/multiarch/strchr.S.  */
  IFUNC_IMPL (i, name, strchr,
	      IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
sysdeps/x86_64/multiarch/strchr.S

@@ -29,12 +29,6 @@ ENTRY(strchr)
	jne	1f
	call	__init_cpu_features
1:	leaq	__strchr_sse2(%rip), %rax
	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
	jnz	2f
	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
	jz	2f
	leaq	__strchr_sse42(%rip), %rax
	ret
2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
	jz	3f
	leaq	__strchr_sse2_no_bsf(%rip), %rax
@@ -42,127 +36,6 @@ ENTRY(strchr)
END(strchr)


/*
   This implementation uses SSE4 instructions to compare up to 16 bytes
   at a time looking for the first occurrence of the character c in the
   string s:

   char *strchr (const char *s, int c);

   We use 0xa:
	_SIDD_SBYTE_OPS
	| _SIDD_CMP_EQUAL_EACH
	| _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to compare xmm/mem128

   0 1 2 3 4 5 6 7 8 9 A B C D E F
   X X X X X X X X X X X X X X X X

   against xmm

   0 1 2 3 4 5 6 7 8 9 A B C D E F
   C C C C C C C C C C C C C C C C

   to find out if the first 16byte data element has a byte C and the
   offset of the first byte.  There are 3 cases:

   1. The first 16byte data element has the byte C at the offset X.
   2. The first 16byte data element has EOS and doesn't have the byte C.
   3. The first 16byte data element is valid and doesn't have the byte C.

   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:

   case   ECX   CFlag   ZFlag   SFlag
    1      X      1      0/1      0
    2     16      0       1       0
    3     16      0       0       0

   We exit from the loop for cases 1 and 2 with jbe which branches
   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
   X for case 1.  */

	.section .text.sse4.2,"ax",@progbits
	.align	16
	.type	__strchr_sse42, @function
	.globl	__strchr_sse42
	.hidden	__strchr_sse42
__strchr_sse42:
	cfi_startproc
	CALL_MCOUNT
	testb	%sil, %sil
	je	__strend_sse4
	pxor	%xmm2, %xmm2
	movd	%esi, %xmm1
	movl	%edi, %ecx
	pshufb	%xmm2, %xmm1
	andl	$15, %ecx
	movq	%rdi, %r8
	je	L(aligned_start)

	/* Handle unaligned string.  */
	andq	$-16, %r8
	movdqa	(%r8), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is.  */
	pmovmskb %xmm2, %edx
	/* Check if there is a match.  */
	pmovmskb %xmm0, %esi
	/* Remove the leading bytes.  */
	sarl	%cl, %edx
	sarl	%cl, %esi
	testl	%esi, %esi
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsfl	%esi, %eax
	/* Is there a NULL?  */
	testl	%edx, %edx
	je	L(unaligned_match)
	bsfl	%edx, %esi
	cmpl	%esi, %eax
	/* Return NULL if NULL comes first.  */
	ja	L(return_null)
L(unaligned_match):
	addq	%rdi, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	testl	%edx, %edx
	jne	L(return_null)

	/* Loop start on aligned string.  */
L(loop):
	addq	$16, %r8
L(aligned_start):
	pcmpistri $0x2, (%r8), %xmm1
	jbe	L(wrap)
	addq	$16, %r8
	pcmpistri $0x2, (%r8), %xmm1
	jbe	L(wrap)
	addq	$16, %r8
	pcmpistri $0x2, (%r8), %xmm1
	jbe	L(wrap)
	addq	$16, %r8
	pcmpistri $0x2, (%r8), %xmm1
	jbe	L(wrap)
	jmp	L(loop)
L(wrap):
	jc	L(loop_exit)

	/* Return NULL.  */
L(return_null):
	xorl	%eax, %eax
	ret

	/* Loop exit.  */
	.p2align 4
L(loop_exit):
	leaq	(%r8,%rcx), %rax
	ret
	cfi_endproc
	.size	__strchr_sse42, .-__strchr_sse42

# undef ENTRY
# define ENTRY(name) \
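
The comment block in the removed __strchr_sse42 above describes the pcmpistri-based scan. As a minimal C sketch of that idea (not the removed assembly itself, and not part of this commit): it assumes c != 0, which the removed code handled by branching to a separate end-of-string routine, and its unaligned loads may read past the terminator, which is why the real code aligned the pointer first. Compile with -msse4.2; the helper name is made up here.

#include <nmmintrin.h>   /* SSE4.2 string intrinsics.  */
#include <stddef.h>

/* imm8 0xa from the comment above: signed bytes, equal-each,
   least-significant index.  */
#define STRCHR_SSE42_MODE \
  (_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT)

static char *
strchr_sse42_sketch (const char *s, int c)   /* assumes c != 0 */
{
  __m128i needle = _mm_set1_epi8 ((char) c);

  for (;; s += 16)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) s);
      /* CF set: a match before any NUL; the cmpistri result is its offset.  */
      if (_mm_cmpistrc (needle, chunk, STRCHR_SSE42_MODE))
        return (char *) s + _mm_cmpistri (needle, chunk, STRCHR_SSE42_MODE);
      /* ZF set: this chunk holds the terminating NUL and no match.  */
      if (_mm_cmpistrz (needle, chunk, STRCHR_SSE42_MODE))
        return NULL;
    }
}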
sysdeps/x86_64/strchr.S

@@ -19,51 +19,174 @@

#include <sysdep.h>

# ifndef ALIGN
# define ALIGN(n) .p2align n
# endif

	.text
ENTRY (strchr)
	movd	%esi, %xmm1
	movq	%rdi, %rcx
	movl	%edi, %eax
	andl	$4095, %eax
	punpcklbw %xmm1, %xmm1
	andq	$~15, %rdi
	pxor	%xmm2, %xmm2
	punpcklbw %xmm1, %xmm1
	orl	$0xffffffff, %esi
	movdqa	(%rdi), %xmm0
	cmpl	$4032, %eax
	punpcklwd %xmm1, %xmm1
	pshufd	$0, %xmm1, %xmm1
	subq	%rdi, %rcx
	movdqa	%xmm0, %xmm3
	leaq	16(%rdi), %rdi
	jg	L(cross_page)
	movdqu	(%rdi), %xmm0
	pxor	%xmm3, %xmm3
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	shl	%cl, %esi
	pmovmskb %xmm0, %edx
	pmovmskb %xmm3, %ecx
	andl	%esi, %edx
	andl	%esi, %ecx
	orl	%edx, %ecx
	jnz	1f
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	je	L(next_48_bytes)
	bsf	%eax, %eax
#ifdef AS_STRCHRNUL
	leaq	(%rdi,%rax), %rax
#else
	movl	$0, %edx
	leaq	(%rdi,%rax), %rax
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax
#endif
	ret

2:	movdqa	(%rdi), %xmm0
	leaq	16(%rdi), %rdi
	movdqa	%xmm0, %xmm3
	ALIGN(3)
L(next_48_bytes):
	movdqu	16(%rdi), %xmm0
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	pmovmskb %xmm0, %edx
	pmovmskb %xmm3, %ecx
	orl	%edx, %ecx
	jz	2b
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %ecx
	movdqu	32(%rdi), %xmm0
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	salq	$16, %rcx
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm0, %xmm3
	salq	$32, %rax
	pcmpeqb	%xmm1, %xmm0
	orq	%rcx, %rax
	por	%xmm3, %xmm0
	pmovmskb %xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rax
	testq	%rax, %rax
	jne	L(return)
L(loop_start):
	/* We use this alignment to force the loop to be aligned to 8 but
	   not 16 bytes.  This gives better scheduling on AMD processors.  */
	ALIGN(4)
	pxor	%xmm6, %xmm6
	andq	$-64, %rdi
	ALIGN(3)
L(loop64):
	addq	$64, %rdi
	movdqa	(%rdi), %xmm5
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	pxor	%xmm1, %xmm5
	movdqa	48(%rdi), %xmm4
	pxor	%xmm1, %xmm2
	pxor	%xmm1, %xmm3
	pminub	(%rdi), %xmm5
	pxor	%xmm1, %xmm4
	pminub	16(%rdi), %xmm2
	pminub	32(%rdi), %xmm3
	pminub	%xmm2, %xmm5
	pminub	48(%rdi), %xmm4
	pminub	%xmm3, %xmm5
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm6, %xmm5
	pmovmskb %xmm5, %eax

	testl	%eax, %eax
	je	L(loop64)

	movdqa	(%rdi), %xmm5
	movdqa	%xmm5, %xmm0
	pcmpeqb	%xmm1, %xmm5
	pcmpeqb	%xmm6, %xmm0
	por	%xmm0, %xmm5
	pcmpeqb	%xmm6, %xmm2
	pcmpeqb	%xmm6, %xmm3
	pcmpeqb	%xmm6, %xmm4

	pmovmskb %xmm5, %ecx
	pmovmskb %xmm2, %eax
	salq	$16, %rax
	pmovmskb %xmm3, %r8d
	pmovmskb %xmm4, %edx
	salq	$32, %r8
	orq	%r8, %rax
	orq	%rcx, %rax
	salq	$48, %rdx
	orq	%rdx, %rax
	ALIGN(3)
L(return):
	bsfq	%rax, %rax
#ifdef AS_STRCHRNUL
	leaq	(%rdi,%rax), %rax
#else
	movl	$0, %edx
	leaq	(%rdi,%rax), %rax
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax
#endif
	ret
	ALIGN(4)

L(cross_page):
	movq	%rdi, %rdx
	pxor	%xmm2, %xmm2
	andq	$-64, %rdx
	movdqa	%xmm1, %xmm0
	movdqa	(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %r8d
	movdqa	16(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %eax
	movdqa	32(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	salq	$16, %rax
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %r9d
	movdqa	48(%rdx), %xmm3
	pcmpeqb	%xmm3, %xmm2
	salq	$32, %r9
	pcmpeqb	%xmm3, %xmm0
	orq	%r9, %rax
	orq	%r8, %rax
	por	%xmm2, %xmm0
	pmovmskb %xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rax
	movl	%edi, %ecx
	subb	%dl, %cl
	shrq	%cl, %rax
	testq	%rax, %rax
	jne	L(return)
	jmp	L(loop_start)

1:	bsfl	%edx, %edx
	jz	4f
	bsfl	%ecx, %ecx
	leaq	-16(%rdi,%rdx), %rax
	cmpl	%edx, %ecx
	je	5f
4:	xorl	%eax, %eax
5:	ret
END (strchr)

#ifndef AS_STRCHRNUL
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)

#endif
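
The heart of the added implementation is the L(loop64) block: for each 16-byte chunk x it computes x XOR C (zero bytes where x matches c) and takes the unsigned byte minimum of that with x itself (zero at the terminator), so one pcmpeqb against zero per 64 bytes tells whether the scan must stop, which the alignment comment above tunes for AMD scheduling. Below is a rough C rendering of that trick on my reading of the code, not part of the commit: it handles 16 bytes per step instead of 64, replaces the masked-load and page-cross prologue with a byte-wise alignment loop, and the helper name is made up.

#include <emmintrin.h>   /* SSE2 intrinsics.  */
#include <stdint.h>
#include <stddef.h>

static char *
strchr_pminub_sketch (const char *s, int c)
{
  const __m128i needle = _mm_set1_epi8 ((char) c);
  const __m128i zero = _mm_setzero_si128 ();

  /* Byte-wise until 16-byte aligned; the committed code instead masks
     the leading bytes of an aligned load and has a page-cross path.  */
  for (; ((uintptr_t) s & 15) != 0; ++s)
    if (*s == (char) c || *s == '\0')
      return *s == (char) c ? (char *) s : NULL;

  for (;; s += 16)
    {
      __m128i x = _mm_load_si128 ((const __m128i *) s);
      /* (x ^ needle) is zero where x matches c; x is zero at the NUL,
         so the unsigned minimum is zero exactly where we must stop.  */
      __m128i stop = _mm_min_epu8 (_mm_xor_si128 (x, needle), x);
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (stop, zero));
      if (mask != 0)
        {
          const char *p = s + __builtin_ctz (mask);   /* first stop byte */
          /* Matching byte: return it; otherwise the NUL came first
             (strchrnul would return p here instead of NULL).  */
          return *p == (char) c ? (char *) p : NULL;
        }
    }
}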
sysdeps/x86_64/strchrnul.S

@@ -20,43 +20,8 @@

#include <sysdep.h>

	.text
ENTRY (__strchrnul)
	movd	%esi, %xmm1
	movq	%rdi, %rcx
	punpcklbw %xmm1, %xmm1
	andq	$~15, %rdi
	pxor	%xmm2, %xmm2
	punpcklbw %xmm1, %xmm1
	orl	$0xffffffff, %esi
	movdqa	(%rdi), %xmm0
	pshufd	$0, %xmm1, %xmm1
	subq	%rdi, %rcx
	movdqa	%xmm0, %xmm3
	leaq	16(%rdi), %rdi
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	shl	%cl, %esi
	pmovmskb %xmm0, %edx
	pmovmskb %xmm3, %ecx
	orl	%edx, %ecx
	andl	%esi, %ecx
	jnz	1f

2:	movdqa	(%rdi), %xmm0
	leaq	16(%rdi), %rdi
	movdqa	%xmm0, %xmm3
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	pmovmskb %xmm0, %edx
	pmovmskb %xmm3, %ecx
	orl	%edx, %ecx
	jz	2b

1:	bsfl	%ecx, %edx
	leaq	-16(%rdi,%rdx), %rax
	ret
END (__strchrnul)
#define strchr __strchrnul
#define AS_STRCHRNUL
#include "strchr.S"

weak_alias (__strchrnul, strchrnul)
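
After this change strchrnul.S carries no loop of its own: it defines strchr to __strchrnul, sets AS_STRCHRNUL so the shared body in strchr.S returns a pointer to the terminator instead of NULL when no match is found, and includes strchr.S. A purely illustrative C analogue of that one-body, two-entry-points pattern (function names here are made up, not glibc's):

#include <stddef.h>

/* One scan body, two behaviours, selected the same way the
   AS_STRCHRNUL conditional does in strchr.S above.  */
static char *
scan_body (const char *s, int c, int nul_gives_pointer)
{
  for (;; ++s)
    {
      if (*s == (char) c)
        return (char *) s;
      if (*s == '\0')
        return nul_gives_pointer ? (char *) s : NULL;
    }
}

char *my_strchr (const char *s, int c)    { return scan_body (s, c, 0); }
char *my_strchrnul (const char *s, int c) { return scan_body (s, c, 1); }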