Mirror of git://sourceware.org/git/glibc.git
[AArch64] Adjust writeback in non-zero memset
This fixes an inefficiency in the non-zero memset. Delaying the writeback until the end of the loop is slightly faster on some cores - this shows ~5% performance gain on Cortex-A53 when doing large non-zero memsets.

	* sysdeps/aarch64/memset.S (MEMSET): Improve non-zero memset loop.
commit 5770c0ad1e
parent 9a62a9397d
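For readers unfamiliar with AArch64 writeback addressing, here is a minimal sketch of the two loop shapes the diff below switches between. It is not taken from memset.S itself: the register aliases are assumptions for illustration, only the 64-byte store loop is shown, and q0 is assumed to already hold the replicated fill value.

	/* Register aliases for this sketch only (assumed, not glibc's).  */
	dst	.req	x0
	count	.req	x1

	/* Old shape: the address update (post-index writeback) happens on the
	   first store of each iteration, so the second store reaches back with
	   a negative offset into the block just advanced past.  */
1:	stp	q0, q0, [dst], 64	/* Bytes 0..31 of the block; dst += 64.  */
	stp	q0, q0, [dst, -32]	/* Bytes 32..63 of the same block.  */
	subs	count, count, 64
	b.hi	1b

	/* New shape: dst is biased by -32 on entry and the writeback
	   (pre-index) is delayed to the last store of the iteration.  */
2:	stp	q0, q0, [dst, 32]	/* Bytes 0..31 of the block.  */
	stp	q0, q0, [dst, 64]!	/* Bytes 32..63; dst += 64 here.  */
	subs	count, count, 64
	b.hi	2b

In the old shape the second store's address depends on the writeback performed by the first store of the same iteration; in the new shape both stores use the base value held on entry to the iteration, and the update only happens on the final store.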
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2018-11-20  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* sysdeps/aarch64/memset.S (MEMSET): Improve non-zero memset loop.
+
 2018-11-20  Joseph Myers  <joseph@codesourcery.com>
 
 	* conform/conformtest.py (ElementTest.run): Use unique identifiers
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -89,10 +89,10 @@ L(set_long):
 	b.eq	L(try_zva)
 L(no_zva):
 	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
 	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
+1:	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
 L(tail64):
 	subs	count, count, 64
 	b.hi	1b
@@ -183,6 +183,7 @@ L(zva_other):
 	subs	count, count, zva_len
 	b.hs	3b
 4:	add	count, count, zva_len
+	sub	dst, dst, 32		/* Bias dst for tail loop.  */
 	b	L(tail64)
 #endif