Optimize x86-64 strlen for SSE4.2.

The SSE4.2 implementation is used in the DSO only.  The patch also adds
some infrastructure to be used in similar code later on.
This commit is contained in:
Ulrich Drepper 2009-06-05 11:32:00 -07:00
parent 443caceb35
commit 3ab2d57a4d
5 changed files with 110 additions and 1 deletions

View File

@ -1,5 +1,11 @@
2009-06-05 Ulrich Drepper <drepper@redhat.com>
* sysdeps/x86_64/multiarch/strlen.S: New file.
* sysdeps/x86_64/multiarch/ifunc-defines.sym: New file.
* sysdeps/x86_64/multiarch/Makefile: Add rule to build ifunc-defines.h.
* sysdeps/x86_64/multiarch/init-arch.h: Name structure with register
content.
* csu/elf-init.c: Only compile in IFUNC functionality if USE_MULTIARCH
is defined.

View File

@ -1,3 +1,4 @@
# Additions that apply only while building in the csu subdirectory.
ifeq ($(subdir),csu)
# Append init-arch to the `aux' list for this subdirectory.
aux += init-arch
# Have ifunc-defines.sym processed by the gen-as-const machinery; the
# accompanying ChangeLog entry describes this as the rule that builds
# ifunc-defines.h.
gen-as-const-headers += ifunc-defines.sym
endif

View File

@ -0,0 +1,15 @@
#include "init-arch.h"
#include <stddef.h>
--
CPU_FEATURES_SIZE sizeof (struct cpu_features)
KIND_OFFSET offsetof (struct cpu_features, kind)
CPUID_OFFSET offsetof (struct cpu_features, cpuid)
CPUID_SIZE sizeof (struct cpuid_registers)
CPUID_EAX_OFFSET offsetof (struct cpuid_registers, eax)
CPUID_EBX_OFFSET offsetof (struct cpuid_registers, ebx)
CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx)
CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx)
COMMON_CPUID_INDEX_1

View File

@ -35,7 +35,7 @@ extern struct cpu_features
arch_kind_other
} kind;
int max_cpuid;
struct
struct cpuid_registers
{
unsigned int eax;
unsigned int ebx;

View File

@ -0,0 +1,87 @@
/* strlen(str) -- determine the length of the string STR.
Copyright (C) 2009 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@redhat.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <ifunc-defines.h>
/* Define multiple versions only for the definition in libc and for
the DSO. In static binaries we need strlen before the initialization
has happened. */
#if defined SHARED && !defined NOT_IN_libc
/* IFUNC resolver for strlen (System V AMD64 ABI, AT&T syntax).

   Reads no argument registers.
   Out:   %rax = address of the implementation to use:
	  __strlen_sse42 if bit 20 of the saved CPUID leaf-1 %ecx value
	  is set, otherwise __strlen_sse2.
   Clobb: %rax, flags, and anything __init_cpu_features clobbers.  */
	.text
ENTRY(strlen)
	.type	strlen, @gnu_indirect_function
	/* Only call __init_cpu_features while the `kind' field of
	   __cpu_features is still zero.  */
	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
	jne	1f
	/* At function entry %rsp is 8 modulo 16 (the return address has
	   just been pushed).  The ABI requires %rsp to be 16-byte aligned
	   at every call instruction, so pad by 8 bytes around the call
	   and keep the unwind information in sync.  */
	subq	$8, %rsp
	cfi_adjust_cfa_offset (8)
	call	__init_cpu_features
	addq	$8, %rsp
	cfi_adjust_cfa_offset (-8)
1:	/* Default to the baseline implementation.  */
	leaq	__strlen_sse2(%rip), %rax
	/* Bit 20 of CPUID.1:%ecx reports SSE4.2 support.  */
	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
	jz	2f
	leaq	__strlen_sse42(%rip), %rax
2:	ret
END(strlen)
/* size_t __strlen_sse42 (const char *s)

   SSE4.2 implementation of strlen (System V AMD64 ABI, AT&T syntax).
   In:    %rdi = s
   Out:   %rax = number of bytes preceding the first NUL byte of s
   Clobb: %rcx, %rdx, %rsi, %rdi, %r8, %xmm1, %xmm2, flags
   Leaf function; the stack is not touched.

   The string is scanned in 16-byte aligned blocks.  The first block
   may start before s; because it is 16-byte aligned it lies within the
   same page as s.  */
	.p2align 4
	.type	__strlen_sse42, @function
__strlen_sse42:
	cfi_startproc
	pxor	%xmm2, %xmm2		/* xmm2 = 0 */
	movq	%rdi, %rcx
	movq	%rdi, %r8		/* r8 = s, for the final subtraction */
	andq	$~15, %rdi		/* rdi = s rounded down to 16 bytes */
	movdqa	%xmm2, %xmm1		/* xmm1 = 0, operand for pcmpistri */
	pcmpeqb	(%rdi), %xmm2		/* 0xff in every lane holding NUL */
	movl	$-1, %esi		/* all-ones mask, no input dependency */
	subq	%rdi, %rcx		/* rcx = s - rdi, in 0..15 */
	shll	%cl, %esi		/* drop mask bits of bytes before s */
	pmovmskb %xmm2, %edx		/* one bit per byte lane */
	andl	%esi, %edx		/* ignore NULs located before s */
	jnz	.Lstrlen_sse42_first_block

	/* Invariant: no NUL in [s, rdi + 16).  With xmm1 all zero and
	   imm8 0x08 (unsigned bytes, equal-each, least significant index)
	   pcmpistri sets ZF iff the memory operand contains a NUL and puts
	   the index of the first one (16 if none) in %ecx.  */
.Lstrlen_sse42_loop:
	pcmpistri $0x08, 16(%rdi), %xmm1
	leaq	16(%rdi), %rdi		/* advance; lea leaves ZF untouched */
	jnz	.Lstrlen_sse42_loop

	leaq	(%rdi,%rcx), %rax	/* address of the NUL byte */
	subq	%r8, %rax		/* length = NUL address - s */
	ret

	/* NUL found in the first aligned block, at or after s.  */
.Lstrlen_sse42_first_block:
	bsfl	%edx, %eax		/* offset of first NUL within block */
	leaq	(%rdi,%rax), %rax	/* address of the NUL byte */
	subq	%r8, %rax		/* length = NUL address - s */
	ret
	cfi_endproc
	.size	__strlen_sse42, .-__strlen_sse42
/* Override ENTRY, END and libc_hidden_builtin_def so that the code
   included below is emitted under the name __strlen_sse2, which is the
   symbol the resolver falls back to.  The overrides keep the function
   entry 16-byte aligned and bracket the body with CFI so it can still
   be unwound.
   NOTE(review): assumes the default ENTRY/END from <sysdep.h> provide
   alignment and CFI as well -- confirm against sysdep.h.  */
# undef ENTRY
# define ENTRY(name) \
	.type __strlen_sse2, @function; \
	.p2align 4; \
	__strlen_sse2: cfi_startproc
# undef END
# define END(name) \
	cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strlen calls through a PLT.
   The speedup we get from using SSE4.2 instruction is likely eaten away
   by the indirect call in the PLT.  So bind the internal alias
   __GI_strlen directly to __strlen_sse2.  */
# define libc_hidden_builtin_def(name) \
	.globl __GI_strlen; __GI_strlen = __strlen_sse2
#endif
#include "../strlen.S"