mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-23 06:20:25 +08:00
aarch64: Use +mops to inline memset operations
This 3rd patch in the series adds an inline sequence for the memset operation. The aarch64-mops-memset-size-threshold param is added to control the size threshold for the sequence. Its default setting is 256, which may seem a bit high, but it is consistent with the current SIMD memset inline sequence limit, and future CPU tunings can override it easily as needed. Bootstrapped and tested on aarch64-none-linux-gnu. gcc/ChangeLog: * config/aarch64/aarch64.c (aarch64_expand_setmem_mops): Define. (aarch64_expand_setmem): Adjust for TARGET_MOPS. * config/aarch64/aarch64.h (CLEAR_RATIO): Adjust for TARGET_MOPS. (SET_RATIO): Likewise. * config/aarch64/aarch64.md ("unspec"): Add UNSPEC_SETMEM. (aarch64_setmemdi): Define. (setmemdi): Adjust for TARGET_MOPS. * config/aarch64/aarch64.opt (aarch64-mops-memset-size-threshold): New param. gcc/testsuite/ChangeLog: * gcc.target/aarch64/mops_3.c: New test.
This commit is contained in:
parent
bb768f8b45
commit
d3bd985e79
@ -23754,6 +23754,28 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
|
||||
*dst = aarch64_progress_pointer (*dst);
|
||||
}
|
||||
|
||||
/* Expand a setmem using the MOPS instructions. OPERANDS are the same
|
||||
as for the setmem pattern. Return true iff we succeed. */
|
||||
static bool
|
||||
aarch64_expand_setmem_mops (rtx *operands)
|
||||
{
|
||||
if (!TARGET_MOPS)
|
||||
return false;
|
||||
|
||||
rtx addr_dst = XEXP (operands[0], 0);
|
||||
rtx sz_reg = operands[1];
|
||||
rtx val = operands[2];
|
||||
|
||||
if (!REG_P (sz_reg))
|
||||
sz_reg = force_reg (DImode, sz_reg);
|
||||
if (!REG_P (addr_dst))
|
||||
addr_dst = force_reg (DImode, addr_dst);
|
||||
if (!REG_P (val) && val != CONST0_RTX (QImode))
|
||||
val = force_reg (QImode, val);
|
||||
emit_insn (gen_aarch64_setmemdi (addr_dst, val, sz_reg));
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Expand setmem, as if from a __builtin_memset. Return true if
|
||||
we succeed, otherwise return false. */
|
||||
|
||||
@ -23767,39 +23789,59 @@ aarch64_expand_setmem (rtx *operands)
|
||||
rtx base;
|
||||
machine_mode cur_mode = BLKmode, next_mode;
|
||||
|
||||
/* We can't do anything smart if the amount to copy is not constant. */
|
||||
if (!CONST_INT_P (operands[1]))
|
||||
return false;
|
||||
/* If we don't have SIMD registers or the size is variable use the MOPS
|
||||
inlined sequence if possible. */
|
||||
if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
|
||||
return aarch64_expand_setmem_mops (operands);
|
||||
|
||||
bool size_p = optimize_function_for_size_p (cfun);
|
||||
|
||||
/* Default the maximum to 256-bytes. */
|
||||
/* Default the maximum to 256-bytes when considering only libcall vs
|
||||
SIMD broadcast sequence. */
|
||||
unsigned max_set_size = 256;
|
||||
|
||||
len = INTVAL (operands[1]);
|
||||
|
||||
/* Upper bound check. */
|
||||
if (len > max_set_size)
|
||||
if (len > max_set_size && !TARGET_MOPS)
|
||||
return false;
|
||||
|
||||
int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
|
||||
/* The MOPS sequence takes:
|
||||
3 instructions for the memory storing
|
||||
+ 1 to move the constant size into a reg
|
||||
+ 1 if VAL is a non-zero constant to move into a reg
|
||||
(zero constants can use XZR directly). */
|
||||
unsigned mops_cost = 3 + 1 + cst_val;
|
||||
/* A libcall to memset in the worst case takes 3 instructions to prepare
|
||||
the arguments + 1 for the call. */
|
||||
unsigned libcall_cost = 4;
|
||||
|
||||
/* Upper bound check. For large constant-sized setmem use the MOPS sequence
|
||||
when available. */
|
||||
if (TARGET_MOPS
|
||||
&& len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
|
||||
return aarch64_expand_setmem_mops (operands);
|
||||
|
||||
/* Attempt a sequence with a vector broadcast followed by stores.
|
||||
Count the number of operations involved to see if it's worth it for
|
||||
code size. */
|
||||
Count the number of operations involved to see if it's worth it
|
||||
against the alternatives. A simple counter simd_ops on the
|
||||
algorithmically-relevant operations is used rather than an rtx_insn count
|
||||
as all the pointer adjustments and mode reinterprets will be optimized
|
||||
away later. */
|
||||
start_sequence ();
|
||||
unsigned nops = 0;
|
||||
unsigned simd_ops = 0;
|
||||
|
||||
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
|
||||
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
|
||||
|
||||
/* Prepare the val using a DUP/MOVI v0.16B, val. */
|
||||
src = expand_vector_broadcast (V16QImode, val);
|
||||
src = force_reg (V16QImode, src);
|
||||
nops++;
|
||||
simd_ops++;
|
||||
/* Convert len to bits to make the rest of the code simpler. */
|
||||
n = len * BITS_PER_UNIT;
|
||||
|
||||
/* Maximum amount to copy in one go. We allow 256-bit chunks based on the
|
||||
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand
|
||||
pattern is only turned on for TARGET_SIMD. */
|
||||
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
|
||||
const int copy_limit = (aarch64_tune_params.extra_tuning_flags
|
||||
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
|
||||
? GET_MODE_BITSIZE (TImode) : 256;
|
||||
@ -23817,7 +23859,7 @@ aarch64_expand_setmem (rtx *operands)
|
||||
|
||||
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
|
||||
aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
|
||||
nops++;
|
||||
simd_ops++;
|
||||
n -= mode_bits;
|
||||
|
||||
/* Do certain trailing copies as overlapping if it's going to be
|
||||
@ -23835,12 +23877,25 @@ aarch64_expand_setmem (rtx *operands)
|
||||
}
|
||||
rtx_insn *seq = get_insns ();
|
||||
end_sequence ();
|
||||
/* A call to memset in the worst case requires 3 instructions to prepare
|
||||
the arguments + 1 for the call. Prefer the inline sequence for size if
|
||||
it is no longer than that. */
|
||||
if (size_p && nops > 4)
|
||||
return false;
|
||||
|
||||
if (size_p)
|
||||
{
|
||||
/* When optimizing for size we have 3 options: the SIMD broadcast sequence,
|
||||
call to memset or the MOPS expansion. */
|
||||
if (TARGET_MOPS
|
||||
&& mops_cost <= libcall_cost
|
||||
&& mops_cost <= simd_ops)
|
||||
return aarch64_expand_setmem_mops (operands);
|
||||
/* If MOPS is not available or not shorter pick a libcall if the SIMD
|
||||
sequence is too long. */
|
||||
else if (libcall_cost < simd_ops)
|
||||
return false;
|
||||
emit_insn (seq);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* At this point the SIMD broadcast sequence is the best choice when
|
||||
optimizing for speed. */
|
||||
emit_insn (seq);
|
||||
return true;
|
||||
}
|
||||
|
@ -1063,14 +1063,14 @@ typedef struct
|
||||
Otherwise follow a sensible default: when optimizing for size, give a better
|
||||
estimate of the length of a memset call, but use the default otherwise. */
|
||||
#define CLEAR_RATIO(speed) \
|
||||
(!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
|
||||
(!STRICT_ALIGNMENT ? (TARGET_MOPS ? 0 : 4) : (speed) ? 15 : AARCH64_CALL_RATIO)
|
||||
|
||||
/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant. Without
|
||||
-mstrict-align, make decisions in "setmem". Otherwise follow a sensible
|
||||
default: when optimizing for size adjust the ratio to account for the
|
||||
overhead of loading the constant. */
|
||||
#define SET_RATIO(speed) \
|
||||
(!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
|
||||
((!STRICT_ALIGNMENT || TARGET_MOPS) ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
|
||||
|
||||
/* Disable auto-increment in move_by_pieces et al. Use of auto-increment is
|
||||
rarely a good idea in straight-line code since it adds an extra address
|
||||
|
@ -204,6 +204,7 @@
|
||||
UNSPEC_SABDL2
|
||||
UNSPEC_SADALP
|
||||
UNSPEC_SCVTF
|
||||
UNSPEC_SETMEM
|
||||
UNSPEC_SISD_NEG
|
||||
UNSPEC_SISD_SSHL
|
||||
UNSPEC_SISD_USHL
|
||||
@ -1650,18 +1651,29 @@
|
||||
}
|
||||
)
|
||||
|
||||
(define_insn "aarch64_setmemdi"
|
||||
[(parallel [
|
||||
(set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
|
||||
(clobber (match_operand:DI 0 "register_operand" "+&r"))
|
||||
(set (mem:BLK (match_dup 0))
|
||||
(unspec:BLK [(match_operand:QI 1 "aarch64_reg_or_zero" "rZ")
|
||||
(match_dup 2)] UNSPEC_SETMEM))])]
|
||||
"TARGET_MOPS"
|
||||
"setp\t[%x0]!, %x2!, %x1\;setm\t[%x0]!, %x2!, %x1\;sete\t[%x0]!, %x2!, %x1"
|
||||
[(set_attr "length" "12")]
|
||||
)
|
||||
|
||||
;; 0 is dst
|
||||
;; 1 is val
|
||||
;; 2 is size of copy in bytes
|
||||
;; 3 is alignment
|
||||
|
||||
(define_expand "setmemdi"
|
||||
[(set (match_operand:BLK 0 "memory_operand") ;; Dest
|
||||
(match_operand:QI 2 "nonmemory_operand")) ;; Value
|
||||
(use (match_operand:DI 1 "immediate_operand")) ;; Length
|
||||
(use (match_operand:DI 1 "general_operand")) ;; Length
|
||||
(match_operand 3 "immediate_operand")] ;; Align
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
"TARGET_SIMD || TARGET_MOPS"
|
||||
{
|
||||
if (aarch64_expand_setmem (operands))
|
||||
DONE;
|
||||
|
||||
|
@ -288,3 +288,7 @@ Constant memcpy size in bytes above which to start using MOPS sequence.
|
||||
-param=aarch64-mops-memmove-size-threshold=
|
||||
Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
|
||||
Constant memmove size in bytes above which to start using MOPS sequence.
|
||||
|
||||
-param=aarch64-mops-memset-size-threshold=
|
||||
Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
|
||||
Constant memset size in bytes from which to start using MOPS sequence.
|
||||
|
85
gcc/testsuite/gcc.target/aarch64/mops_3.c
Normal file
85
gcc/testsuite/gcc.target/aarch64/mops_3.c
Normal file
@ -0,0 +1,85 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memset-size-threshold=0" } */
|
||||
/* { dg-final { check-function-bodies "**" "" "" } } */
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
/* We want to inline variable-sized memset.
|
||||
** do_it_set:
|
||||
** setp \[x0\]\!, x2\!, x1
|
||||
** setm \[x0\]\!, x2\!, x1
|
||||
** sete \[x0\]\!, x2\!, x1
|
||||
** ret
|
||||
*/
|
||||
void do_it_set (char * out, int n, size_t size)
|
||||
{
|
||||
__builtin_memset (out, n, size);
|
||||
}
|
||||
|
||||
/*
|
||||
** do_it_set_large:
|
||||
** mov w2, 1
|
||||
** mov x1, 1024
|
||||
** setp \[x0\]\!, x1\!, x2
|
||||
** setm \[x0\]\!, x1\!, x2
|
||||
** sete \[x0\]\!, x1\!, x2
|
||||
** ret
|
||||
*/
|
||||
void do_it_set_large (char * out)
|
||||
{
|
||||
__builtin_memset (out, 1, 1024);
|
||||
}
|
||||
|
||||
/*
|
||||
** do_it_set_256:
|
||||
** mov w2, 1
|
||||
** mov x1, 256
|
||||
** setp \[x0\]\!, x1\!, x2
|
||||
** setm \[x0\]\!, x1\!, x2
|
||||
** sete \[x0\]\!, x1\!, x2
|
||||
** ret
|
||||
*/
|
||||
void do_it_set_256 (char * out)
|
||||
{
|
||||
__builtin_memset (out, 1, 256);
|
||||
}
|
||||
|
||||
/*
|
||||
** do_it_set_255:
|
||||
** mov w2, 1
|
||||
** mov x1, 255
|
||||
** setp \[x0\]\!, x1\!, x2
|
||||
** setm \[x0\]\!, x1\!, x2
|
||||
** sete \[x0\]\!, x1\!, x2
|
||||
** ret
|
||||
*/
|
||||
void do_it_set_255 (char * out)
|
||||
{
|
||||
__builtin_memset (out, 1, 255);
|
||||
}
|
||||
|
||||
/*
|
||||
** do_it_set_0:
|
||||
** setp \[x0\]\!, x1\!, xzr
|
||||
** setm \[x0\]\!, x1\!, xzr
|
||||
** sete \[x0\]\!, x1\!, xzr
|
||||
** ret
|
||||
*/
|
||||
void do_it_set_0 (char * out, size_t n)
|
||||
{
|
||||
__builtin_memset (out, 0, n);
|
||||
}
|
||||
|
||||
/*
|
||||
** do_it_set_0_255:
|
||||
** mov x1, 255
|
||||
** setp \[x0\]\!, x1\!, xzr
|
||||
** setm \[x0\]\!, x1\!, xzr
|
||||
** sete \[x0\]\!, x1\!, xzr
|
||||
** ret
|
||||
*/
|
||||
void do_it_set_0_255 (char * out)
|
||||
{
|
||||
__builtin_memset (out, 0, 255);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user