mirror of
git://sourceware.org/git/glibc.git
synced 2024-12-15 04:20:28 +08:00
powerpc: Optimization for strlen for POWER8.
This implementation takes advantage of vectorization to improve performance of the loop over the current strlen implementation for POWER7.
This commit is contained in:
parent
223f17cf7b
commit
1b045ee53e
13
ChangeLog
13
ChangeLog
@ -1,3 +1,16 @@
|
|||||||
|
2016-04-15 Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
|
||||||
|
|
||||||
|
* sysdeps/powerpc/powerpc64/multiarch/Makefile:
|
||||||
|
(sysdep_routines): Added __strlen_power8.
|
||||||
|
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Added
|
||||||
|
__strlen_power8 entry.
|
||||||
|
* sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S: New file.
|
||||||
|
Implementation for POWER8.
|
||||||
|
* sysdeps/powerpc/powerpc64/multiarch/strlen.c: Added IFUNC selector
|
||||||
|
for __strlen_power8.
|
||||||
|
* sysdeps/powerpc/powerpc64/power8/strlen.S: New file.
|
||||||
|
Implementation for POWER8.
|
||||||
|
|
||||||
2016-04-15 Mike Frysinger <vapier@gentoo.org>
|
2016-04-15 Mike Frysinger <vapier@gentoo.org>
|
||||||
|
|
||||||
* locale/iso-4217.def: Add SSP and change ZMK to ZMW.
|
* locale/iso-4217.def: Add SSP and change ZMK to ZMW.
|
||||||
|
@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
|
|||||||
strcat-power8 strcat-power7 strcat-ppc64 \
|
strcat-power8 strcat-power7 strcat-ppc64 \
|
||||||
memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
|
memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
|
||||||
strncpy-power8 strstr-power7 strstr-ppc64 \
|
strncpy-power8 strstr-power7 strstr-ppc64 \
|
||||||
strspn-power8 strspn-ppc64
|
strspn-power8 strspn-ppc64 strlen-power8
|
||||||
|
|
||||||
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
|
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
|
||||||
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
|
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
|
||||||
|
@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||||||
|
|
||||||
/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
|
/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
|
||||||
IFUNC_IMPL (i, name, strlen,
|
IFUNC_IMPL (i, name, strlen,
|
||||||
|
IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
||||||
|
__strlen_power8)
|
||||||
IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
|
IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
__strlen_power7)
|
__strlen_power7)
|
||||||
IFUNC_IMPL_ADD (array, i, strlen, 1,
|
IFUNC_IMPL_ADD (array, i, strlen, 1,
|
||||||
|
39
sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
Normal file
39
sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
/* Optimized strlen implementation for POWER8.
|
||||||
|
Copyright (C) 2016 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<http://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
#undef EALIGN
|
||||||
|
#define EALIGN(name, alignt, words) \
|
||||||
|
.section ".text"; \
|
||||||
|
ENTRY_2(__strlen_power8) \
|
||||||
|
.align ALIGNARG(alignt); \
|
||||||
|
EALIGN_W_##words; \
|
||||||
|
BODY_LABEL(__strlen_power8): \
|
||||||
|
cfi_startproc; \
|
||||||
|
LOCALENTRY(__strlen_power8)
|
||||||
|
#undef END
|
||||||
|
#define END(name) \
|
||||||
|
cfi_endproc; \
|
||||||
|
TRACEBACK(__strlen_power8) \
|
||||||
|
END_2(__strlen_power8)
|
||||||
|
|
||||||
|
#undef libc_hidden_builtin_def
|
||||||
|
#define libc_hidden_builtin_def(name)
|
||||||
|
|
||||||
|
#include <sysdeps/powerpc/powerpc64/power8/strlen.S>
|
@ -29,8 +29,11 @@ extern __typeof (__redirect_strlen) __libc_strlen;
|
|||||||
|
|
||||||
extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
|
extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
|
||||||
extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
|
extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
|
||||||
|
extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
|
||||||
|
|
||||||
libc_ifunc (__libc_strlen,
|
libc_ifunc (__libc_strlen,
|
||||||
|
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
||||||
|
? __strlen_power8 :
|
||||||
(hwcap & PPC_FEATURE_HAS_VSX)
|
(hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
? __strlen_power7
|
? __strlen_power7
|
||||||
: __strlen_ppc);
|
: __strlen_ppc);
|
||||||
|
297
sysdeps/powerpc/powerpc64/power8/strlen.S
Normal file
297
sysdeps/powerpc/powerpc64/power8/strlen.S
Normal file
@ -0,0 +1,297 @@
|
|||||||
|
/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
|
||||||
|
loop.
|
||||||
|
Copyright (C) 2016 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, see
|
||||||
|
<http://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
/* TODO: change these to the actual instructions when the minimum required
|
||||||
|
binutils allows it. */
|
||||||
|
#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
|
||||||
|
#define VBPERMQ(t,a,b) .long (0x1000054c \
|
||||||
|
| ((t)<<(32-11)) \
|
||||||
|
| ((a)<<(32-16)) \
|
||||||
|
| ((b)<<(32-21)) )
|
||||||
|
|
||||||
|
/* size_t [r3] strlen (const char *s [r3]) */
|
||||||
|
|
||||||
|
/* TODO: change this to .machine power8 when the minimum required binutils
|
||||||
|
allows it. */
|
||||||
|
.machine power7
|
||||||
|
EALIGN (strlen, 4, 0)
|
||||||
|
CALL_MCOUNT 1
|
||||||
|
dcbt 0,r3
|
||||||
|
clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
|
||||||
|
rlwinm r6,r3,3,26,28 /* Calculate padding. */
|
||||||
|
li r0,0 /* Doubleword with null chars to use
|
||||||
|
with cmpb. */
|
||||||
|
li r5,-1 /* MASK = 0xffffffffffffffff. */
|
||||||
|
ld r12,0(r4) /* Load doubleword from memory. */
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
sld r5,r5,r6
|
||||||
|
#else
|
||||||
|
srd r5,r5,r6 /* MASK = MASK >> padding. */
|
||||||
|
#endif
|
||||||
|
orc r9,r12,r5 /* Mask bits that are not part of the string. */
|
||||||
|
cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
|
||||||
|
cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
|
||||||
|
bne cr7,L(done)
|
||||||
|
|
||||||
|
/* For shorter strings (< 64 bytes), we will not use vector registers,
|
||||||
|
as the overhead isn't worth it. So, let's use GPRs instead. This
|
||||||
|
will be done the same way as we do in the POWER7 implementation.
|
||||||
|
Let's see if we are aligned to a quadword boundary. If so, we can
|
||||||
|
jump to the first (non-vectorized) loop. Otherwise, we have to
|
||||||
|
handle the next DWORD first. */
|
||||||
|
mtcrf 0x01,r4
|
||||||
|
mr r9,r4
|
||||||
|
addi r9,r9,8
|
||||||
|
bt 28,L(align64)
|
||||||
|
|
||||||
|
/* Handle the next 8 bytes so we are aligned to a quadword
|
||||||
|
boundary. */
|
||||||
|
ldu r5,8(r4)
|
||||||
|
cmpb r10,r5,r0
|
||||||
|
cmpdi cr7,r10,0
|
||||||
|
addi r9,r9,8
|
||||||
|
bne cr7,L(done)
|
||||||
|
|
||||||
|
L(align64):
|
||||||
|
/* Proceed to the old (POWER7) implementation, checking two doublewords
|
||||||
|
per iteration. For the first 56 bytes, we will just check for null
|
||||||
|
characters. After that, we will also check if we are 64-byte aligned
|
||||||
|
so we can jump to the vectorized implementation. We will unroll
|
||||||
|
these loops to avoid excessive branching. */
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
bne cr7,L(dword_zero)
|
||||||
|
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
bne cr7,L(dword_zero)
|
||||||
|
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
bne cr7,L(dword_zero)
|
||||||
|
|
||||||
|
/* Are we 64-byte aligned? If so, jump to the vectorized loop.
|
||||||
|
Note: aligning to 64-byte will necessarily slow down performance for
|
||||||
|
strings around 64 bytes in length due to the extra comparisons
|
||||||
|
required to check alignment for the vectorized loop. This is a
|
||||||
|
necessary tradeoff we are willing to take in order to speed up the
|
||||||
|
calculation for larger strings. */
|
||||||
|
andi. r10,r9,63
|
||||||
|
beq cr0,L(preloop)
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
bne cr7,L(dword_zero)
|
||||||
|
|
||||||
|
andi. r10,r9,63
|
||||||
|
beq cr0,L(preloop)
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
bne cr7,L(dword_zero)
|
||||||
|
|
||||||
|
andi. r10,r9,63
|
||||||
|
beq cr0,L(preloop)
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
bne cr7,L(dword_zero)
|
||||||
|
|
||||||
|
andi. r10,r9,63
|
||||||
|
beq cr0,L(preloop)
|
||||||
|
ld r6,8(r4)
|
||||||
|
ldu r5,16(r4)
|
||||||
|
cmpb r10,r6,r0
|
||||||
|
cmpb r11,r5,r0
|
||||||
|
or r5,r10,r11
|
||||||
|
cmpdi cr7,r5,0
|
||||||
|
addi r9,r9,16
|
||||||
|
|
||||||
|
/* At this point, we are necessarily 64-byte aligned. If no zeroes were
|
||||||
|
found, jump to the vectorized loop. */
|
||||||
|
beq cr7,L(preloop)
|
||||||
|
|
||||||
|
L(dword_zero):
|
||||||
|
/* OK, one (or both) of the doublewords contains a null byte. Check
|
||||||
|
the first doubleword and decrement the address in case the first
|
||||||
|
doubleword really contains a null byte. */
|
||||||
|
|
||||||
|
cmpdi cr6,r10,0
|
||||||
|
addi r4,r4,-8
|
||||||
|
bne cr6,L(done)
|
||||||
|
|
||||||
|
/* The null byte must be in the second doubleword. Adjust the address
|
||||||
|
again and move the result of cmpb to r10 so we can calculate the
|
||||||
|
length. */
|
||||||
|
|
||||||
|
mr r10,r11
|
||||||
|
addi r4,r4,8
|
||||||
|
|
||||||
|
/* If the null byte was found in the non-vectorized code, compute the
|
||||||
|
final length. r10 has the output of the cmpb instruction, that is,
|
||||||
|
it contains 0xff in the same position as the null byte in the
|
||||||
|
original doubleword from the string. Use that to calculate the
|
||||||
|
length. */
|
||||||
|
L(done):
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
addi r9, r10,-1 /* Form a mask from trailing zeros. */
|
||||||
|
andc r9, r9,r10
|
||||||
|
popcntd r0, r9 /* Count the bits in the mask. */
|
||||||
|
#else
|
||||||
|
cntlzd r0,r10 /* Count leading zeros before the match. */
|
||||||
|
#endif
|
||||||
|
subf r5,r3,r4
|
||||||
|
srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
||||||
|
add r3,r5,r0 /* Compute final length. */
|
||||||
|
blr
|
||||||
|
|
||||||
|
/* Vectorized implementation starts here. */
|
||||||
|
.p2align 4
|
||||||
|
L(preloop):
|
||||||
|
/* Set up for the loop. */
|
||||||
|
mr r4,r9
|
||||||
|
li r7, 16 /* Load required offsets. */
|
||||||
|
li r8, 32
|
||||||
|
li r9, 48
|
||||||
|
li r12, 8
|
||||||
|
vxor v0,v0,v0 /* VR with null chars to use with
|
||||||
|
vcmpequb. */
|
||||||
|
|
||||||
|
/* Main loop to look for the end of the string. We will read in
|
||||||
|
64-byte chunks. Align it to 32 bytes and unroll it 3 times to
|
||||||
|
leverage the icache performance. */
|
||||||
|
.p2align 5
|
||||||
|
L(loop):
|
||||||
|
lvx v1,r4,r0 /* Load 4 quadwords. */
|
||||||
|
lvx v2,r4,r7
|
||||||
|
lvx v3,r4,r8
|
||||||
|
lvx v4,r4,r9
|
||||||
|
vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
|
||||||
|
vminub v6,v3,v4
|
||||||
|
vminub v7,v5,v6
|
||||||
|
vcmpequb. v7,v7,v0 /* Check for NULLs. */
|
||||||
|
addi r4,r4,64 /* Adjust address for the next iteration. */
|
||||||
|
bne cr6,L(vmx_zero)
|
||||||
|
|
||||||
|
lvx v1,r4,r0 /* Load 4 quadwords. */
|
||||||
|
lvx v2,r4,r7
|
||||||
|
lvx v3,r4,r8
|
||||||
|
lvx v4,r4,r9
|
||||||
|
vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
|
||||||
|
vminub v6,v3,v4
|
||||||
|
vminub v7,v5,v6
|
||||||
|
vcmpequb. v7,v7,v0 /* Check for NULLs. */
|
||||||
|
addi r4,r4,64 /* Adjust address for the next iteration. */
|
||||||
|
bne cr6,L(vmx_zero)
|
||||||
|
|
||||||
|
lvx v1,r4,r0 /* Load 4 quadwords. */
|
||||||
|
lvx v2,r4,r7
|
||||||
|
lvx v3,r4,r8
|
||||||
|
lvx v4,r4,r9
|
||||||
|
vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
|
||||||
|
vminub v6,v3,v4
|
||||||
|
vminub v7,v5,v6
|
||||||
|
vcmpequb. v7,v7,v0 /* Check for NULLs. */
|
||||||
|
addi r4,r4,64 /* Adjust address for the next iteration. */
|
||||||
|
beq cr6,L(loop)
|
||||||
|
|
||||||
|
L(vmx_zero):
|
||||||
|
/* OK, we found a null byte. Let's look for it in the current 64-byte
|
||||||
|
block and mark it in its corresponding VR. */
|
||||||
|
vcmpequb v1,v1,v0
|
||||||
|
vcmpequb v2,v2,v0
|
||||||
|
vcmpequb v3,v3,v0
|
||||||
|
vcmpequb v4,v4,v0
|
||||||
|
|
||||||
|
/* We will now 'compress' the result into a single doubleword, so it
|
||||||
|
can be moved to a GPR for the final calculation. First, we
|
||||||
|
generate an appropriate mask for vbpermq, so we can permute bits into
|
||||||
|
the first halfword. */
|
||||||
|
vspltisb v10,3
|
||||||
|
lvsl v11,r0,r0
|
||||||
|
vslb v10,v11,v10
|
||||||
|
|
||||||
|
/* Permute the first bit of each byte into bits 48-63. */
|
||||||
|
VBPERMQ(v1,v1,v10)
|
||||||
|
VBPERMQ(v2,v2,v10)
|
||||||
|
VBPERMQ(v3,v3,v10)
|
||||||
|
VBPERMQ(v4,v4,v10)
|
||||||
|
|
||||||
|
/* Shift each component into its correct position for merging. */
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
vsldoi v2,v2,v2,2
|
||||||
|
vsldoi v3,v3,v3,4
|
||||||
|
vsldoi v4,v4,v4,6
|
||||||
|
#else
|
||||||
|
vsldoi v1,v1,v1,6
|
||||||
|
vsldoi v2,v2,v2,4
|
||||||
|
vsldoi v3,v3,v3,2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Merge the results and move to a GPR. */
|
||||||
|
vor v1,v2,v1
|
||||||
|
vor v2,v3,v4
|
||||||
|
vor v4,v1,v2
|
||||||
|
MFVRD(r10,v4)
|
||||||
|
|
||||||
|
/* Adjust address to the beginning of the current 64-byte block. */
|
||||||
|
addi r4,r4,-64
|
||||||
|
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
addi r9, r10,-1 /* Form a mask from trailing zeros. */
|
||||||
|
andc r9, r9,r10
|
||||||
|
popcntd r0, r9 /* Count the bits in the mask. */
|
||||||
|
#else
|
||||||
|
cntlzd r0,r10 /* Count leading zeros before the match. */
|
||||||
|
#endif
|
||||||
|
subf r5,r3,r4
|
||||||
|
add r3,r5,r0 /* Compute final length. */
|
||||||
|
blr
|
||||||
|
|
||||||
|
END (strlen)
|
||||||
|
libc_hidden_builtin_def (strlen)
|
Loading…
Reference in New Issue
Block a user