aarch64: Use +mops to inline memset operations

This third patch in the series adds an inline expansion of the memset operation using the MOPS instructions.
A new param, aarch64-mops-memset-size-threshold, controls the size threshold at and above which the MOPS
sequence is used. Its default is 256 bytes, which may seem a bit high, but it is consistent with the limit
of the current SIMD memset inline sequence, and future CPU tunings can easily override it as needed.
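
As an illustration of the threshold (a sketch; the function names are mine, not part of the patch),
with the default settings the expander picks the MOPS sequence once the constant length reaches the
threshold, since the comparison is len >= threshold:

/* Compiled at -O2 with -march=armv8.6-a+mops and default params.  */
void set256 (char *p) { __builtin_memset (p, 1, 256); }  /* MOPS: 256 >= 256.  */
void set255 (char *p) { __builtin_memset (p, 1, 255); }  /* SIMD broadcast: 255 < 256.  */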

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64.c (aarch64_expand_setmem_mops): Define.
	(aarch64_expand_setmem): Adjust for TARGET_MOPS.
	* config/aarch64/aarch64.h (CLEAR_RATIO): Adjust for TARGET_MOPS.
	(SET_RATIO): Likewise.
	* config/aarch64/aarch64.md ("unspec"): Add UNSPEC_SETMEM.
	(aarch64_setmemdi): Define.
	(setmemdi): Adjust for TARGET_MOPS.
	* config/aarch64/aarch64.opt (aarch64-mops-memset-size-threshold):
	New param.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/mops_3.c: New test.
Author: Kyrylo Tkachov
Date:   2021-12-13 14:14:21 +00:00
Parent: bb768f8b45
Commit: d3bd985e79
5 changed files with 181 additions and 25 deletions

--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c

@@ -23754,6 +23754,28 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
+
+/* Expand a setmem using the MOPS instructions.  OPERANDS are the same
+   as for the setmem pattern.  Return true iff we succeed.  */
+static bool
+aarch64_expand_setmem_mops (rtx *operands)
+{
+  if (!TARGET_MOPS)
+    return false;
+
+  rtx addr_dst = XEXP (operands[0], 0);
+  rtx sz_reg = operands[1];
+  rtx val = operands[2];
+  if (!REG_P (sz_reg))
+    sz_reg = force_reg (DImode, sz_reg);
+  if (!REG_P (addr_dst))
+    addr_dst = force_reg (DImode, addr_dst);
+  if (!REG_P (val) && val != CONST0_RTX (QImode))
+    val = force_reg (QImode, val);
+  emit_insn (gen_aarch64_setmemdi (addr_dst, val, sz_reg));
+  return true;
+}
+
 /* Expand setmem, as if from a __builtin_memset.  Return true if
    we succeed, otherwise return false.  */
@@ -23767,39 +23789,59 @@ aarch64_expand_setmem (rtx *operands)
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
-  /* We can't do anything smart if the amount to copy is not constant.  */
-  if (!CONST_INT_P (operands[1]))
-    return false;
+  /* If we don't have SIMD registers or the size is variable use the MOPS
+     inlined sequence if possible.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+    return aarch64_expand_setmem_mops (operands);
   bool size_p = optimize_function_for_size_p (cfun);
-  /* Default the maximum to 256-bytes.  */
+  /* Default the maximum to 256-bytes when considering only libcall vs
+     SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
   len = INTVAL (operands[1]);
-  /* Upper bound check.  */
-  if (len > max_set_size)
+  if (len > max_set_size && !TARGET_MOPS)
     return false;
+  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
+  /* The MOPS sequence takes:
+     3 instructions for the memory storing
+     + 1 to move the constant size into a reg
+     + 1 if VAL is a non-zero constant to move into a reg
+     (zero constants can use XZR directly).  */
+  unsigned mops_cost = 3 + 1 + cst_val;
+  /* A libcall to memset in the worst case takes 3 instructions to prepare
+     the arguments + 1 for the call.  */
+  unsigned libcall_cost = 4;
+  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
+     when available.  */
+  if (TARGET_MOPS
+      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
+    return aarch64_expand_setmem_mops (operands);
   /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it for
-     code size.  */
+     Count the number of operations involved to see if it's worth it
+     against the alternatives.  A simple counter simd_ops on the
+     algorithmically-relevant operations is used rather than an rtx_insn count
+     as all the pointer adjustments and mode reinterprets will be optimized
+     away later.  */
   start_sequence ();
-  unsigned nops = 0;
+  unsigned simd_ops = 0;
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  nops++;
+  simd_ops++;
   /* Convert len to bits to make the rest of the code simpler.  */
   n = len * BITS_PER_UNIT;
   /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  setmem expand
-     pattern is only turned on for TARGET_SIMD.  */
+     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
   const int copy_limit = (aarch64_tune_params.extra_tuning_flags
                           & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
                          ? GET_MODE_BITSIZE (TImode) : 256;
@@ -23817,7 +23859,7 @@ aarch64_expand_setmem (rtx *operands)
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
       aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      nops++;
+      simd_ops++;
       n -= mode_bits;
       /* Do certain trailing copies as overlapping if it's going to be
@@ -23835,12 +23877,25 @@ aarch64_expand_setmem (rtx *operands)
     }
   rtx_insn *seq = get_insns ();
   end_sequence ();
-  /* A call to memset in the worst case requires 3 instructions to prepare
-     the arguments + 1 for the call.  Prefer the inline sequence for size if
-     it is no longer than that.  */
-  if (size_p && nops > 4)
-    return false;
+  if (size_p)
+    {
+      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
+         call to memset or the MOPS expansion.  */
+      if (TARGET_MOPS
+          && mops_cost <= libcall_cost
+          && mops_cost <= simd_ops)
+        return aarch64_expand_setmem_mops (operands);
+      /* If MOPS is not available or not shorter pick a libcall if the SIMD
+         sequence is too long.  */
+      else if (libcall_cost < simd_ops)
+        return false;
+      emit_insn (seq);
+      return true;
+    }
+  /* At this point the SIMD broadcast sequence is the best choice when
+     optimizing for speed.  */
   emit_insn (seq);
   return true;
 }
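
To make the -Os cost comparison in aarch64_expand_setmem concrete, here is a standalone
sketch of the same arithmetic (the function name and parameters are mine, not GCC's; it
assumes TARGET_MOPS):

#include <stdbool.h>

/* Mirrors the patch's size-optimization heuristic: the MOPS sequence
   costs the 3 set instructions + 1 mov of the size + 1 mov if the value
   is a non-zero constant (a zero value can use xzr); a libcall costs
   roughly 3 argument-setup instructions + 1 for the call.  */
static bool
prefer_mops_for_size (bool val_is_nonzero_const, unsigned simd_ops)
{
  unsigned mops_cost = 3 + 1 + (val_is_nonzero_const ? 1 : 0);
  unsigned libcall_cost = 4;
  return mops_cost <= libcall_cost && mops_cost <= simd_ops;
}

So for a zero memset (mops_cost of 4) the MOPS sequence wins at -Os whenever the SIMD
sequence needs at least 4 operations, while for a non-zero constant value (mops_cost of 5)
it can never beat the 4-instruction libcall.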

--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h

@@ -1063,14 +1063,14 @@ typedef struct
    Otherwise follow a sensible default: when optimizing for size, give a better
    estimate of the length of a memset call, but use the default otherwise.  */
 #define CLEAR_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
+  (!STRICT_ALIGNMENT ? (TARGET_MOPS ? 0 : 4) : (speed) ? 15 : AARCH64_CALL_RATIO)
 /* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant.  Without
    -mstrict-align, make decisions in "setmem".  Otherwise follow a sensible
    default: when optimizing for size adjust the ratio to account for the
    overhead of loading the constant.  */
 #define SET_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
+  ((!STRICT_ALIGNMENT || TARGET_MOPS) ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
 /* Disable auto-increment in move_by_pieces et al.  Use of auto-increment is
    rarely a good idea in straight-line code since it adds an extra address
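
A ratio of 0 tells the middle end's by-pieces machinery to stand down, so with TARGET_MOPS
every such clear or set flows into the setmemdi expander, where the SIMD/MOPS/libcall
decision above is made. An illustrative user-level view (my example, not from the patch):

/* With -march=armv8.6-a+mops and the default -mno-strict-align,
   CLEAR_RATIO of 0 means even a small clear like this reaches the
   setmemdi expander rather than being split up by clear_by_pieces.  */
void clear_small (char *p)
{
  __builtin_memset (p, 0, 16);
}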

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md

@@ -204,6 +204,7 @@
     UNSPEC_SABDL2
     UNSPEC_SADALP
     UNSPEC_SCVTF
+    UNSPEC_SETMEM
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
     UNSPEC_SISD_USHL
@@ -1650,18 +1651,29 @@
 }
 )
+
+(define_insn "aarch64_setmemdi"
+  [(parallel [
+   (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
+   (clobber (match_operand:DI 0 "register_operand" "+&r"))
+   (set (mem:BLK (match_dup 0))
+        (unspec:BLK [(match_operand:QI 1 "aarch64_reg_or_zero" "rZ")
+                     (match_dup 2)] UNSPEC_SETMEM))])]
+ "TARGET_MOPS"
+ "setp\t[%x0]!, %x2!, %x1\;setm\t[%x0]!, %x2!, %x1\;sete\t[%x0]!, %x2!, %x1"
+ [(set_attr "length" "12")]
+)
+
 ;; 0 is dst
 ;; 1 is val
 ;; 2 is size of copy in bytes
 ;; 3 is alignment
 (define_expand "setmemdi"
   [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
        (match_operand:QI 2 "nonmemory_operand"))  ;; Value
-   (use (match_operand:DI 1 "immediate_operand")) ;; Length
+   (use (match_operand:DI 1 "general_operand")) ;; Length
    (match_operand 3 "immediate_operand")]     ;; Align
-  "TARGET_SIMD"
-{
+  "TARGET_SIMD || TARGET_MOPS"
+{
   if (aarch64_expand_setmem (operands))
     DONE;
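
The three mnemonics in the output template are the architected prologue/main/epilogue parts
of the MOPS set operation; they update the destination and size registers as they execute,
which is what the pattern's (set ... (const_int 0)) on the size operand and the clobber of
the destination register express, and the "length" of 12 covers the three 4-byte
instructions. A quick way to see the insn in action (mirroring the new test below):

/* Compile with -O2 -march=armv8.6-a+mops -S; per the mops_3.c test
   below, the expected body is:
     setp [x0]!, x2!, x1
     setm [x0]!, x2!, x1
     sete [x0]!, x2!, x1
     ret  */
void set_var (char *out, int n, unsigned long size)
{
  __builtin_memset (out, n, size);
}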

--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt

@@ -288,3 +288,7 @@ Constant memcpy size in bytes above which to start using MOPS sequence.
 -param=aarch64-mops-memmove-size-threshold=
 Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
 Constant memmove size in bytes above which to start using MOPS sequence.
+
+-param=aarch64-mops-memset-size-threshold=
+Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
+Constant memset size in bytes from which to start using MOPS sequence.
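
One consequence of combining this param with the MOPS-aware upper-bound check (my reading
of aarch64_expand_setmem above, worth double-checking): raising the threshold past 256
lets constant sizes in between be expanded with the SIMD broadcast sequence rather than a
libcall, because the 256-byte bail-out no longer fires when TARGET_MOPS. For example:

/* Hypothetical invocation:
     gcc -O2 -march=armv8.6-a+mops \
         --param=aarch64-mops-memset-size-threshold=512 -S file.c
   300 is below the raised MOPS threshold, and with TARGET_MOPS the
   len > 256 check does not return false, so the SIMD broadcast
   sequence is emitted inline.  */
void set300 (char *p)
{
  __builtin_memset (p, 1, 300);
}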

--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mops_3.c

@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memset-size-threshold=0" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdlib.h>
+
+/* We want to inline variable-sized memset.
+** do_it_set:
+**	setp	\[x0\]\!, x2\!, x1
+**	setm	\[x0\]\!, x2\!, x1
+**	sete	\[x0\]\!, x2\!, x1
+**	ret
+*/
+void do_it_set (char * out, int n, size_t size)
+{
+  __builtin_memset (out, n, size);
+}
+
+/*
+** do_it_set_large:
+**	mov	w2, 1
+**	mov	x1, 1024
+**	setp	\[x0\]\!, x1\!, x2
+**	setm	\[x0\]\!, x1\!, x2
+**	sete	\[x0\]\!, x1\!, x2
+**	ret
+*/
+void do_it_set_large (char * out)
+{
+  __builtin_memset (out, 1, 1024);
+}
+
+/*
+** do_it_set_256:
+**	mov	w2, 1
+**	mov	x1, 256
+**	setp	\[x0\]\!, x1\!, x2
+**	setm	\[x0\]\!, x1\!, x2
+**	sete	\[x0\]\!, x1\!, x2
+**	ret
+*/
+void do_it_set_256 (char * out)
+{
+  __builtin_memset (out, 1, 256);
+}
+
+/*
+** do_it_set_255:
+**	mov	w2, 1
+**	mov	x1, 255
+**	setp	\[x0\]\!, x1\!, x2
+**	setm	\[x0\]\!, x1\!, x2
+**	sete	\[x0\]\!, x1\!, x2
+**	ret
+*/
+void do_it_set_255 (char * out)
+{
+  __builtin_memset (out, 1, 255);
+}
+
+/*
+** do_it_set_0:
+**	setp	\[x0\]\!, x1\!, xzr
+**	setm	\[x0\]\!, x1\!, xzr
+**	sete	\[x0\]\!, x1\!, xzr
+**	ret
+*/
+void do_it_set_0 (char * out, size_t n)
+{
+  __builtin_memset (out, 0, n);
+}
+
+/*
+** do_it_set_0_255:
+**	mov	x1, 255
+**	setp	\[x0\]\!, x1\!, xzr
+**	setm	\[x0\]\!, x1\!, xzr
+**	sete	\[x0\]\!, x1\!, xzr
+**	ret
+*/
+void do_it_set_0_255 (char * out)
+{
+  __builtin_memset (out, 0, 255);
+}