libatomic: Enable LSE128 128-bit atomics for Armv9.4-a

The armv9.4-a architectural revision adds three new atomic operations
associated with the LSE128 feature:

  * LDCLRP - Atomic AND NOT (bitclear) of a location with 128-bit
  value held in a pair of registers, with original data loaded into
  the same 2 registers.
  * LDSETP - Atomic OR (bitset) of a location with 128-bit value held
  in a pair of registers, with original data loaded into the same 2
  registers.
  * SWPP - Atomic swap of one 128-bit value with 128-bit value held
  in a pair of registers.

It is worth noting that in keeping with existing 128-bit atomic
operations in `atomic_16.S', we have chosen to merge certain
less-restrictive orderings into more restrictive ones.  This is done
to minimize the number of branches in the atomic functions, reducing
both the likelihood of branch mispredictions and, by keeping the code
small, the need for extra fetch cycles.

Past benchmarking has revealed that acquire is typically slightly
faster than release (5-10%), such that for the most frequently used
atomics (CAS and SWP) it makes sense to add support for acquire, as
well as release.

Likewise, it was identified that combining acquire and release typically
results in little to no penalty, such that it is of negligible benefit
to distinguish between release and acquire-release, making the
combining of release/acq_rel/seq_cst orderings a worthwhile design choice.

This patch adds the logic required to make use of these when the
architectural feature is present and a suitable assembler available.

In order to do this, the following changes are made:

  1. Add a configure-time check to check for LSE128 support in the
  assembler.
  2. Edit host-config.h so that when N == 16, nifunc = 2.
  3. Where available due to LSE128, implement the second ifunc, making
  use of the novel instructions.
  4. For atomic functions unable to make use of these new
  instructions, define a new alias which causes the _i1 function
  variant to point ahead to the corresponding _i2 implementation.

libatomic/ChangeLog:

	* Makefile.am (AM_CPPFLAGS): add conditional setting of
	-DHAVE_FEAT_LSE128.
	* acinclude.m4 (LIBAT_TEST_FEAT_AARCH64_LSE128): New.
	* config/linux/aarch64/atomic_16.S (LSE128): New macro
	definition.
	(libat_exchange_16): New LSE128 variant.
	(libat_fetch_or_16): Likewise.
	(libat_or_fetch_16): Likewise.
	(libat_fetch_and_16): Likewise.
	(libat_and_fetch_16): Likewise.
	* config/linux/aarch64/host-config.h (IFUNC_COND_2): New.
	(IFUNC_NCOND): Add operand size checking.
	(has_lse2): Renamed from `ifunc1`.
	(has_lse128): New.
	(HWCAP2_LSE128): Likewise.
	* configure.ac: Add call to
	LIBAT_TEST_FEAT_AARCH64_LSE128.
	* configure (ac_subst_vars): Regenerated via autoreconf.
	* Makefile.in: Likewise.
	* auto-config.h.in: Likewise.
This commit is contained in:
Victor Do Nascimento 2023-08-01 11:07:56 +01:00
parent a899a1f2f3
commit 5ad64d76c0
8 changed files with 293 additions and 9 deletions

View File

@ -130,6 +130,9 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix _$(s)_.lo,$(SIZEOBJS)))
## On a target-specific basis, include alternates to be selected by IFUNC.
if HAVE_IFUNC
if ARCH_AARCH64_LINUX
if ARCH_AARCH64_HAVE_LSE128
AM_CPPFLAGS = -DHAVE_FEAT_LSE128
endif
IFUNC_OPTIONS = -march=armv8-a+lse
libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
libatomic_la_SOURCES += atomic_16.S

View File

@ -452,6 +452,7 @@ M_SRC = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
_$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_3) \
$(am__append_4) $(am__append_5)
@ARCH_AARCH64_HAVE_LSE128_TRUE@@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@AM_CPPFLAGS = -DHAVE_FEAT_LSE128
@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp -DHAVE_KERNEL64
@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586

View File

@ -83,6 +83,25 @@ AC_DEFUN([LIBAT_TEST_ATOMIC_BUILTIN],[
])
])
dnl
dnl Test if the host assembler supports armv9.4-a LSE128 insns.
dnl
AC_DEFUN([LIBAT_TEST_FEAT_AARCH64_LSE128],[
dnl Probe the assembler by compiling a program whose inline asm switches
dnl the target to armv9-a+lse128; cache the yes/no result so repeated
dnl configure runs skip the compile.
AC_CACHE_CHECK([for armv9.4-a LSE128 insn support],
[libat_cv_have_feat_lse128],[
AC_LANG_CONFTEST([AC_LANG_PROGRAM([],[asm(".arch armv9-a+lse128")])])
if AC_TRY_EVAL(ac_compile); then
eval libat_cv_have_feat_lse128=yes
else
eval libat_cv_have_feat_lse128=no
fi
rm -f conftest*
])
dnl Export the result both as a config.h macro (for atomic_16.S and
dnl host-config.h) and as an automake conditional (for Makefile.am).
LIBAT_DEFINE_YESNO([HAVE_FEAT_LSE128], [$libat_cv_have_feat_lse128],
[Have LSE128 support for 16 byte integers.])
AM_CONDITIONAL([ARCH_AARCH64_HAVE_LSE128], [test x$libat_cv_have_feat_lse128 = xyes])
])
dnl
dnl Test if we have __atomic_load and __atomic_store for mode $1, size $2
dnl

View File

@ -105,6 +105,9 @@
/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H
/* Have LSE128 support for 16 byte integers. */
#undef HAVE_FEAT_LSE128
/* Define to 1 if you have the <fenv.h> header file. */
#undef HAVE_FENV_H

View File

@ -35,12 +35,17 @@
writes, this will be true when using atomics in actual code.
The libat_<op>_16 entry points are ARMv8.0.
The libat_<op>_16_i1 entry points are used when LSE2 is available. */
The libat_<op>_16_i1 entry points are used when LSE128 is available.
The libat_<op>_16_i2 entry points are used when LSE2 is available. */
#if HAVE_FEAT_LSE128
.arch armv9-a+lse128
#else
.arch armv8-a+lse
#endif
#define LSE2(NAME) NAME##_i1
#define LSE128(NAME) NAME##_i1
#define LSE2(NAME) NAME##_i2
#define CORE(NAME) NAME
#define ENTRY_FEAT(NAME, FEAT) \
@ -202,6 +207,31 @@ ENTRY (libat_exchange_16)
END (libat_exchange_16)
#if HAVE_FEAT_LSE128
/* 128-bit atomic exchange using the LSE128 SWPP instruction family.
   In:  x0 = object address, in0/in1 = replacement value (register pair),
        w4 = C memory model.
   Out: res0/res1 = value previously held at [x0].
   Weaker orderings are merged into stronger ones (see file header) to
   keep the dispatch to two branches.  */
ENTRY_FEAT (libat_exchange_16, LSE128)
	mov	tmp0, x0		/* Free the res0/res1 pair; SWPP reads and
					   writes the same two registers.  */
	mov	res0, in0
	mov	res1, in1
	cbnz	w4, 1f			/* w4 == 0 <=> RELAXED.  */
	/* RELAXED.  */
	swpp	res0, res1, [tmp0]
	ret
1:
	cmp	w4, ACQUIRE
	b.hi	2f			/* Anything above ACQUIRE gets the
					   full-barrier form.  */
	/* ACQUIRE/CONSUME.  */
	swppa	res0, res1, [tmp0]
	ret
	/* RELEASE/ACQ_REL/SEQ_CST.  */
2:	swppal	res0, res1, [tmp0]
	ret
END_FEAT (libat_exchange_16, LSE128)
#endif
ENTRY (libat_compare_exchange_16)
ldp exp0, exp1, [x1]
cbz w4, 3f
@ -395,6 +425,31 @@ ENTRY (libat_fetch_or_16)
END (libat_fetch_or_16)
#if HAVE_FEAT_LSE128
/* 128-bit atomic fetch-or using the LSE128 LDSETP (bitset) instruction.
   In:  x0 = object address, in0/in1 = OR operand (register pair),
        w4 = C memory model.
   Out: res0/res1 = value held at [x0] BEFORE the OR (fetch-or returns
        the old value, so no post-processing is needed; LDSETP loads the
        original data into its source register pair).  */
ENTRY_FEAT (libat_fetch_or_16, LSE128)
	mov	tmp0, x0		/* Free res0/res1 for LDSETP's
					   in/out register pair.  */
	mov	res0, in0
	mov	res1, in1
	cbnz	w4, 1f			/* w4 == 0 <=> RELAXED.  */
	/* RELAXED.  */
	ldsetp	res0, res1, [tmp0]
	ret
1:
	cmp	w4, ACQUIRE
	b.hi	2f
	/* ACQUIRE/CONSUME.  */
	ldsetpa	res0, res1, [tmp0]
	ret
	/* RELEASE/ACQ_REL/SEQ_CST.  */
2:	ldsetpal	res0, res1, [tmp0]
	ret
END_FEAT (libat_fetch_or_16, LSE128)
#endif
ENTRY (libat_or_fetch_16)
mov x5, x0
cbnz w4, 2f
@ -417,6 +472,36 @@ ENTRY (libat_or_fetch_16)
END (libat_or_fetch_16)
#if HAVE_FEAT_LSE128
/* 128-bit atomic or-fetch using the LSE128 LDSETP (bitset) instruction.
   In:  x0 = object address, in0/in1 = OR operand (register pair),
        w4 = C memory model.
   Out: res0/res1 = NEW value of [x0]; LDSETP returns the old value, so
        the operand is re-ORed into the result after the atomic op.
   Fix: the saving of the operand into tmp0/tmp1 previously sat after
   the cbnz dispatch, so the ACQUIRE and SEQ_CST paths ORed with
   uninitialized tmp registers; it must run on every path.  */
ENTRY_FEAT (libat_or_fetch_16, LSE128)
	mov	tmp0, in0		/* Preserve the operand: LDSETP
					   overwrites in0/in1 with the old
					   memory contents.  */
	mov	tmp1, in1
	cbnz	w4, 1f			/* w4 == 0 <=> RELAXED.  */
	/* RELAXED.  */
	ldsetp	in0, in1, [x0]
	orr	res0, in0, tmp0
	orr	res1, in1, tmp1
	ret
1:
	cmp	w4, ACQUIRE
	b.hi	2f
	/* ACQUIRE/CONSUME.  */
	ldsetpa	in0, in1, [x0]
	orr	res0, in0, tmp0
	orr	res1, in1, tmp1
	ret
	/* RELEASE/ACQ_REL/SEQ_CST.  */
2:	ldsetpal	in0, in1, [x0]
	orr	res0, in0, tmp0
	orr	res1, in1, tmp1
	ret
END_FEAT (libat_or_fetch_16, LSE128)
#endif
ENTRY (libat_fetch_and_16)
mov x5, x0
cbnz w4, 2f
@ -439,6 +524,32 @@ ENTRY (libat_fetch_and_16)
END (libat_fetch_and_16)
#if HAVE_FEAT_LSE128
/* 128-bit atomic fetch-and using the LSE128 LDCLRP (bitclear)
   instruction.
   In:  x0 = object address, in0/in1 = AND operand (register pair),
        w4 = C memory model.
   Out: res0/res1 = value held at [x0] BEFORE the AND.
   LDCLRP computes [x0] &= ~operand, so the operand is complemented
   first to get AND semantics; the old value lands in the source pair,
   which is exactly what fetch-and must return.  */
ENTRY_FEAT (libat_fetch_and_16, LSE128)
	mov	tmp0, x0		/* Free res0/res1 for the operand.  */
	mvn	res0, in0		/* res = ~operand: x & m == x &~ ~m.  */
	mvn	res1, in1
	cbnz	w4, 1f			/* w4 == 0 <=> RELAXED.  */
	/* RELAXED.  */
	ldclrp	res0, res1, [tmp0]
	ret
1:
	cmp	w4, ACQUIRE
	b.hi	2f
	/* ACQUIRE/CONSUME.  */
	ldclrpa	res0, res1, [tmp0]
	ret
	/* RELEASE/ACQ_REL/SEQ_CST.  */
2:	ldclrpal	res0, res1, [tmp0]
	ret
END_FEAT (libat_fetch_and_16, LSE128)
#endif
ENTRY (libat_and_fetch_16)
mov x5, x0
cbnz w4, 2f
@ -461,6 +572,37 @@ ENTRY (libat_and_fetch_16)
END (libat_and_fetch_16)
#if HAVE_FEAT_LSE128
/* 128-bit atomic and-fetch using the LSE128 LDCLRP (bitclear)
   instruction.
   In:  x0 = object address, in0/in1 = AND operand (register pair),
        w4 = C memory model.
   Out: res0/res1 = NEW value of [x0]; LDCLRP returns the old value, so
        the operand is re-ANDed into the result after the atomic op.
   Fixes: the second MVN wrote tmp0 twice, leaving tmp1 (the high half
   of the complemented operand) uninitialized; and the SEQ_CST path
   addressed [x5], which is never set in this function — all paths must
   use [x0].  */
ENTRY_FEAT (libat_and_fetch_16, LSE128)
	mvn	tmp0, in0		/* LDCLRP clears bits: x & m is
					   achieved by clearing ~m.  */
	mvn	tmp1, in1
	cbnz	w4, 1f			/* w4 == 0 <=> RELAXED.  */
	/* RELAXED.  */
	ldclrp	tmp0, tmp1, [x0]
	and	res0, tmp0, in0
	and	res1, tmp1, in1
	ret
1:
	cmp	w4, ACQUIRE
	b.hi	2f
	/* ACQUIRE/CONSUME.  */
	ldclrpa	tmp0, tmp1, [x0]
	and	res0, tmp0, in0
	and	res1, tmp1, in1
	ret
	/* RELEASE/ACQ_REL/SEQ_CST.  */
2:	ldclrpal	tmp0, tmp1, [x0]
	and	res0, tmp0, in0
	and	res1, tmp1, in1
	ret
END_FEAT (libat_and_fetch_16, LSE128)
#endif
ENTRY (libat_fetch_xor_16)
mov x5, x0
cbnz w4, 2f
@ -566,6 +708,28 @@ ENTRY (libat_test_and_set_16)
END (libat_test_and_set_16)
/* Alias entry points which are the same in LSE2 and LSE128. */
#if !HAVE_FEAT_LSE128
ALIAS (libat_exchange_16, LSE128, LSE2)
ALIAS (libat_fetch_or_16, LSE128, LSE2)
ALIAS (libat_fetch_and_16, LSE128, LSE2)
ALIAS (libat_or_fetch_16, LSE128, LSE2)
ALIAS (libat_and_fetch_16, LSE128, LSE2)
#endif
ALIAS (libat_load_16, LSE128, LSE2)
ALIAS (libat_store_16, LSE128, LSE2)
ALIAS (libat_compare_exchange_16, LSE128, LSE2)
ALIAS (libat_fetch_add_16, LSE128, LSE2)
ALIAS (libat_add_fetch_16, LSE128, LSE2)
ALIAS (libat_fetch_sub_16, LSE128, LSE2)
ALIAS (libat_sub_fetch_16, LSE128, LSE2)
ALIAS (libat_fetch_xor_16, LSE128, LSE2)
ALIAS (libat_xor_fetch_16, LSE128, LSE2)
ALIAS (libat_fetch_nand_16, LSE128, LSE2)
ALIAS (libat_nand_fetch_16, LSE128, LSE2)
ALIAS (libat_test_and_set_16, LSE128, LSE2)
/* Alias entry points which are the same in baseline and LSE2. */
ALIAS (libat_exchange_16, LSE2, CORE)

View File

@ -37,14 +37,17 @@ typedef struct __ifunc_arg_t {
#ifdef HWCAP_USCAT
# if N == 16
# define IFUNC_COND_1 ifunc1 (hwcap, features)
# define IFUNC_COND_1 (has_lse128 (hwcap, features))
# define IFUNC_COND_2 (has_lse2 (hwcap, features))
# define IFUNC_NCOND(N) 2
# else
# define IFUNC_COND_1 (hwcap & HWCAP_ATOMICS)
# define IFUNC_COND_1 (hwcap & HWCAP_ATOMICS)
# define IFUNC_NCOND(N) 1
# endif
#else
# define IFUNC_COND_1 (false)
# define IFUNC_NCOND(N) 1
#endif
#define IFUNC_NCOND(N) (1)
#endif /* HAVE_IFUNC */
@ -59,7 +62,7 @@ typedef struct __ifunc_arg_t {
#define MIDR_PARTNUM(midr) (((midr) >> 4) & 0xfff)
static inline bool
ifunc1 (unsigned long hwcap, const __ifunc_arg_t *features)
has_lse2 (unsigned long hwcap, const __ifunc_arg_t *features)
{
if (hwcap & HWCAP_USCAT)
return true;
@ -75,6 +78,37 @@ ifunc1 (unsigned long hwcap, const __ifunc_arg_t *features)
return false;
}
/* LSE128 atomic support encoded in ID_AA64ISAR0_EL1.Atomic,
bits[23:20]. The expected value is 0b0011. Check that. */
#define AT_FEAT_FIELD(isar0) (((isar0) >> 20) & 15)
/* Ensure backwards compatibility with glibc <= 2.38. */
#ifndef HWCAP2_LSE128
#define HWCAP2_LSE128 (1UL << 47)
#endif
/* Return true if the running CPU implements the LSE128 atomics.
   Preferred source is the kernel-reported HWCAP2_LSE128 bit; when that
   bit is clear it may only mean the kernel predates it, so fall back to
   reading ID_AA64ISAR0_EL1 directly — possible only when the kernel
   traps and emulates MRS for userspace (HWCAP_CPUID).  */
static inline bool
has_lse128 (unsigned long hwcap, const __ifunc_arg_t *features)
{
  /* Trust HWCAP2 only if the resolver was actually handed a second
     hwcap word (_IFUNC_ARG_HWCAP set).  */
  if ((hwcap & _IFUNC_ARG_HWCAP) != 0
      && (features->_hwcap2 & HWCAP2_LSE128) != 0)
    return true;

  if (hwcap & HWCAP_CPUID)
    {
      unsigned long isar0;
      asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0));
      /* Atomic field value >= 0b0011 advertises LSE128.  */
      return AT_FEAT_FIELD (isar0) >= 3;
    }

  /* Without HWCAP_CPUID we cannot probe further; assume absent.  */
  return false;
}
#endif
#include_next <host-config.h>

61
libatomic/configure vendored
View File

@ -656,6 +656,8 @@ LIBAT_BUILD_VERSIONED_SHLIB_FALSE
LIBAT_BUILD_VERSIONED_SHLIB_TRUE
OPT_LDFLAGS
SECTION_LDFLAGS
ARCH_AARCH64_HAVE_LSE128_FALSE
ARCH_AARCH64_HAVE_LSE128_TRUE
SYSROOT_CFLAGS_FOR_TARGET
enable_aarch64_lse
libtool_VERSION
@ -11456,7 +11458,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 11459 "configure"
#line 11461 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@ -11562,7 +11564,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 11565 "configure"
#line 11567 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@ -14697,6 +14699,57 @@ _ACEOF
# Check for target-specific assembly-level support for atomic operations.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for armv9.4-a LSE128 insn support" >&5
$as_echo_n "checking for armv9.4-a LSE128 insn support... " >&6; }
if ${libat_cv_have_feat_lse128+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
asm(".arch armv9-a+lse128")
;
return 0;
}
_ACEOF
if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
(eval $ac_compile) 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
eval libat_cv_have_feat_lse128=yes
else
eval libat_cv_have_feat_lse128=no
fi
rm -f conftest*
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libat_cv_have_feat_lse128" >&5
$as_echo "$libat_cv_have_feat_lse128" >&6; }
yesno=`echo $libat_cv_have_feat_lse128 | tr 'yesno' '1 0 '`
cat >>confdefs.h <<_ACEOF
#define HAVE_FEAT_LSE128 $yesno
_ACEOF
if test x$libat_cv_have_feat_lse128 = xyes; then
ARCH_AARCH64_HAVE_LSE128_TRUE=
ARCH_AARCH64_HAVE_LSE128_FALSE='#'
else
ARCH_AARCH64_HAVE_LSE128_TRUE='#'
ARCH_AARCH64_HAVE_LSE128_FALSE=
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
if ${ac_cv_c_bigendian+:} false; then :
@ -15989,6 +16042,10 @@ if test -z "${ENABLE_DARWIN_AT_RPATH_TRUE}" && test -z "${ENABLE_DARWIN_AT_RPATH
as_fn_error $? "conditional \"ENABLE_DARWIN_AT_RPATH\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_AARCH64_HAVE_LSE128_TRUE}" && test -z "${ARCH_AARCH64_HAVE_LSE128_FALSE}"; then
as_fn_error $? "conditional \"ARCH_AARCH64_HAVE_LSE128\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${LIBAT_BUILD_VERSIONED_SHLIB_TRUE}" && test -z "${LIBAT_BUILD_VERSIONED_SHLIB_FALSE}"; then
as_fn_error $? "conditional \"LIBAT_BUILD_VERSIONED_SHLIB\" was never defined.

View File

@ -206,6 +206,9 @@ LIBAT_FORALL_MODES([LIBAT_HAVE_ATOMIC_CAS])
LIBAT_FORALL_MODES([LIBAT_HAVE_ATOMIC_FETCH_ADD])
LIBAT_FORALL_MODES([LIBAT_HAVE_ATOMIC_FETCH_OP])
# Check for target-specific assembly-level support for atomic operations.
LIBAT_TEST_FEAT_AARCH64_LSE128()
AC_C_BIGENDIAN
# I don't like the default behaviour of WORDS_BIGENDIAN undefined for LE.
AH_BOTTOM(