AArch64: Optimize strcpy

Unroll the main loop.  Large strings are around 20% faster on modern CPUs.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
This commit is contained in:
Wilco Dijkstra 2023-01-11 13:52:39 +00:00
parent 09ebd8549b
commit 349e48c01e

View File

@@ -30,7 +30,6 @@
* MTE compatible.
*/
/* Arguments and results. */
#define dstin x0
#define srcin x1
#define result x0
@@ -76,14 +75,14 @@ ENTRY (STRCPY)
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
@@ -102,13 +101,10 @@ ENTRY (STRCPY)
IFSTPCPY (add result, dstin, len)
ret
.p2align 4,,8
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
.p2align 4
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
@@ -141,31 +137,37 @@ L(zerobyte):
.p2align 4
L(start_loop):
sub len, src, srcin
sub tmp, srcin, dstin
ldr dataq2, [srcin]
add dst, dstin, len
sub dst, src, tmp
str dataq2, [dstin]
.p2align 5
L(loop):
str dataq, [dst], 16
ldr dataq, [src, 16]!
str dataq, [dst], 32
ldr dataq, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbnz synd, L(loopend)
str dataq, [dst, -16]
ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
add dst, dst, 16
L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
sub tmp, len, 15
ldr dataq, [src, tmp]
str dataq, [dst, tmp]
IFSTPCPY (add result, dst, len)
add dst, dst, len
ldr dataq, [dst, tmp]
str dataq, [dst]
IFSTPCPY (add result, dst, 15)
ret
END (STRCPY)