aarch64: Optimize __libc_mtag_tag_zero_region

This is a target hook for memory tagging; the original was a naive
implementation. The new version uses the same algorithm as
__libc_mtag_tag_region, but with instructions that also zero the
memory. It was not benchmarked on a real CPU, but it is expected to
be faster than the naive implementation.
Szabolcs Nagy 2021-02-09 17:59:11 +00:00
parent 23fd760add
commit 1dc17ea8f8
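
The new code keeps the structure of __libc_mtag_tag_region: sizes up to 96
bytes are written with a few possibly overlapping STZG/STZ2G stores, and
larger regions use a DC GZVA loop when the ZVA block size is 64 bytes
(otherwise an STZ2G loop). The C below is only a rough sketch of that size
dispatch under the routine's contract (pointer 16-byte aligned, size a
multiple of 16); memset stands in for the tag-and-zero stores, and the
function name tag_zero_region_sketch is made up for illustration, it is not
glibc code.

    /* Illustrative sketch only (not glibc code): mirrors the size dispatch
       of the optimized routine, with memset standing in for the STZG/STZ2G
       and DC GZVA tag-and-zero stores.  Assumes the routine's contract:
       PTR is 16-byte aligned and SIZE is a multiple of 16.  */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void *
    tag_zero_region_sketch (void *ptr, size_t size)
    {
      char *p = ptr;
      char *end = p + size;

      if (size <= 96)
        {
          if (size & 64)
            {
              /* 64..96 bytes: 64 bytes from the start, 32 from the end
                 (the stores may overlap).  */
              memset (p, 0, 64);
              memset (end - 32, 0, 32);
            }
          else if (size != 0)
            {
              /* 16, 32 or 48 bytes: up to three overlapping 16-byte
                 stores, as in the lsr/add pair in the assembly.  */
              memset (p, 0, 16);
              memset (p + ((size >> 5) << 4), 0, 16);
              memset (end - 16, 0, 16);
            }
          return ptr;
        }

      /* Over 96 bytes: unaligned head and tail stores plus a 64-byte
         aligned bulk loop.  The real code uses DC GZVA here for counts of
         at least 160 bytes when the ZVA size is 64 bytes, and a paired
         STZ2G loop otherwise.  */
      memset (p, 0, 64);
      char *bulk = (char *) ((uintptr_t) p & ~(uintptr_t) 63) + 64;
      while (end - bulk > 64)
        {
          memset (bulk, 0, 64);   /* stands in for DC GZVA on one line */
          bulk += 64;
        }
      memset (end - 64, 0, 64);
      return ptr;
    }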


@@ -20,30 +20,94 @@
 
 #ifdef USE_MTAG
 
 /* Assumptions:
  *
  * ARMv8-a, AArch64, MTE, LP64 ABI.
  *
  * Interface contract:
  * Address is 16 byte aligned and size is multiple of 16.
  * Returns the passed pointer.
  * The memory region may remain untagged if tagging is not enabled.
  */
 
 	.arch armv8.5-a
 	.arch_extension memtag
 
-/* NB, only supported on variants with 64-bit pointers.  */
-
-/* FIXME: This is a minimal implementation.  We could do much better than
-   this for large values of COUNT.  */
-
-#define dstin x0
-#define count x1
-#define dst   x2
+#define dstin	x0
+#define count	x1
+#define dst	x2
+#define dstend	x3
+#define tmp	x4
+#define zva_val	x4
 
-ENTRY(__libc_mtag_tag_zero_region)
+ENTRY (__libc_mtag_tag_zero_region)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 
-	mov	dst, dstin
-L(loop):
-	stzg	dst, [dst], #16
-	subs	count, count, 16
-	bne	L(loop)
-#if 0
-	/* This is not currently needed, since for now we are only called
-	   to tag memory that is taggable.  */
-	ldg	dstin, [dstin] // Recover the tag created (might be untagged).
-#endif
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+
+	tbnz	count, 6, L(set96)
+
+	/* Set 0, 16, 32, or 48 bytes.  */
+	lsr	tmp, count, 5
+	add	tmp, dstin, tmp, lsl 4
+	cbz	count, L(end)
+	stzg	dstin, [dstin]
+	stzg	dstin, [tmp]
+	stzg	dstin, [dstend, -16]
+L(end):
 	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	stz2g	dstin, [dstin]
+	stz2g	dstin, [dstin, 32]
+	stz2g	dstin, [dstend, -32]
+	ret
+
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	stz2g	dstin, [dstin]
+	stz2g	dstin, [dstin, 32]
+	bic	dst, dstin, 63
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64
+	dc	gzva, dst
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	stz2g	dstin, [dstend, -64]
+	stz2g	dstin, [dstend, -32]
+	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	stz2g	dstin, [dst, 32]
+	stz2g	dstin, [dst, 64]!
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stz2g	dstin, [dstend, -64]
+	stz2g	dstin, [dstend, -32]
+	ret
 
 END (__libc_mtag_tag_zero_region)
 #endif /* USE_MTAG */
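
For reference, the SKIP_ZVA_CHECK block above only takes the DC GZVA path
when the low five bits of DCZID_EL0 equal 4. A hedged C equivalent of that
test is sketched below; the helper name is invented, but DCZID_EL0 itself is
readable from EL0 on AArch64.

    /* Sketch of the ZVA check: bits [3:0] of DCZID_EL0 give log2 of the
       DC ZVA/GZVA block size in 4-byte words, and bit 4 (DZP) set means the
       instructions must not be used.  "and zva_val, zva_val, 31;
       cmp zva_val, 4" therefore accepts only DZP == 0 with a 16-word
       (64-byte) block.  */
    #include <stdint.h>

    static inline int
    zva_block_is_64_bytes (void)
    {
      uint64_t dczid;
      asm ("mrs %0, dczid_el0" : "=r" (dczid));
      return (dczid & 31) == 4;
    }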