arm: Fix MVE ICEs with vector moves and -mpure-code [PR97252]

This fixes around 500 ICEs in the testsuite which can be seen when
testing with -march=armv8.1-m.main+mve -mfloat-abi=hard -mpure-code
(leaving the testsuite free of ICEs in this configuration). All of the
ICEs are in arm_print_operand (which is expecting a mem and gets another
rtx, e.g. a const_vector) when running the output code for
*mve_mov<mode> in alternative 4.

The issue is that MVE vector moves were relying on the arm_reorg pass to
move constant vectors that we can't easily synthesize to the literal
pool. This doesn't work for -mpure-code where the literal pool is
disabled. LLVM puts these in .rodata: I've chosen to do the same here.

With this change, for -mpure-code, we no longer want to allow a constant
on the RHS of a vector load in RA. To achieve this, I added a new
constraint which matches constants only if the literal pool is
available.

gcc/ChangeLog:

	PR target/97252
	* config/arm/arm-protos.h (neon_make_constant): Add generate
	argument to guard emitting insns, default to true.
	* config/arm/arm.c (arm_legitimate_constant_p_1): Reject
	CONST_VECTORs which neon_make_constant can't handle.
	(neon_vdup_constant): Add generate argument, avoid emitting
	insns if it's not set.
	(neon_make_constant): Plumb new generate argument through.
	* config/arm/constraints.md (Ui): New. Use it...
	* config/arm/mve.md (*mve_mov<mode>): ... here.
	* config/arm/vec-common.md (movv8hf): Use neon_make_constant to
	synthesize constants.
This commit is contained in:
Alex Coplan 2021-03-22 14:43:15 +00:00
parent 57d9ac6728
commit 150a829acc
5 changed files with 35 additions and 13 deletions

View File

@@ -101,7 +101,7 @@ extern char *neon_output_shift_immediate (const char *, char, rtx *,
machine_mode, int, bool);
extern void neon_pairwise_reduce (rtx, rtx, machine_mode,
rtx (*) (rtx, rtx, rtx));
extern rtx neon_make_constant (rtx);
extern rtx neon_make_constant (rtx, bool generate = true);
extern tree arm_builtin_vectorized_function (unsigned int, tree, tree);
extern void neon_expand_vector_init (rtx, rtx);
extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree);

View File

@@ -9461,6 +9461,9 @@ arm_tls_referenced_p (rtx x)
static bool
arm_legitimate_constant_p_1 (machine_mode, rtx x)
{
/* Reject CONST_VECTORs that neon_make_constant cannot handle (queried
   without emitting insns): e.g. with -mpure-code the literal pool is
   disabled, so such constants cannot be materialized and must not be
   accepted here.  */
if (GET_CODE (x) == CONST_VECTOR && !neon_make_constant (x, false))
return false;
return flag_pic || !label_mentioned_p (x);
}
@@ -13025,12 +13028,14 @@ neon_pairwise_reduce (rtx op0, rtx op1, machine_mode mode,
}
}
/* If VALS is a vector constant that can be loaded into a register
using VDUP, generate instructions to do so and return an RTX to
assign to the register. Otherwise return NULL_RTX. */
/* Return a non-NULL RTX iff VALS is a vector constant that can be
loaded into a register using VDUP.
If this is the case, and GENERATE is set, we also generate
instructions to do this and return an RTX to assign to the register. */
static rtx
neon_vdup_constant (rtx vals)
neon_vdup_constant (rtx vals, bool generate)
{
machine_mode mode = GET_MODE (vals);
machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -13046,6 +13051,9 @@ neon_vdup_constant (rtx vals)
vdup.i16). */
return NULL_RTX;
if (!generate)
return x;
/* We can load this constant by using VDUP and a constant in a
single ARM register. This will be cheaper than a vector
load. */
@@ -13054,13 +13062,15 @@ neon_vdup_constant (rtx vals)
return gen_vec_duplicate (mode, x);
}
/* Generate code to load VALS, which is a PARALLEL containing only
constants (for vec_init) or CONST_VECTOR, efficiently into a
register. Returns an RTX to copy into the register, or NULL_RTX
for a PARALLEL that cannot be converted into a CONST_VECTOR. */
/* Return a non-NULL RTX iff VALS, which is a PARALLEL containing only
constants (for vec_init) or CONST_VECTOR, can be efficiently loaded
into a register.
If this is the case, and GENERATE is set, we also generate code to do
this and return an RTX to copy into the register. */
rtx
neon_make_constant (rtx vals)
neon_make_constant (rtx vals, bool generate)
{
machine_mode mode = GET_MODE (vals);
rtx target;
@@ -13092,7 +13102,7 @@ neon_make_constant (rtx vals)
&& simd_immediate_valid_for_move (const_vec, mode, NULL, NULL))
/* Load using VMOV. On Cortex-A8 this takes one cycle. */
return const_vec;
else if ((target = neon_vdup_constant (vals)) != NULL_RTX)
else if ((target = neon_vdup_constant (vals, generate)) != NULL_RTX)
/* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON
pipeline cycle; creating the constant takes one or two ARM
pipeline cycles. */
@@ -13102,7 +13112,7 @@ neon_make_constant (rtx vals)
(for either double or quad vectors). We cannot take advantage
of single-cycle VLD1 because we need a PC-relative addressing
mode. */
return const_vec;
return arm_disable_literal_pool ? NULL_RTX : const_vec;
else
/* A PARALLEL containing something not valid inside CONST_VECTOR.
We cannot construct an initializer. */

View File

@@ -506,6 +506,13 @@
&& mve_vector_mem_operand (GET_MODE (op),
XEXP (op, 0), true)")))
(define_constraint "Ui"
"@internal
Match a constant (as per the 'i' constraint) provided that we have the
literal pool available. This is useful for load insns that would need
to move such constants to the literal pool after RA."
(match_test "!arm_disable_literal_pool && satisfies_constraint_i (op)"))
(define_memory_constraint "Uq"
"@internal
In ARM state an address valid in ldrsb instructions."

View File

@@ -19,7 +19,7 @@
(define_insn "*mve_mov<mode>"
[(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Ux,w")
(match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,Uxi,r,Dm,w,Ul"))]
(match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,UxUi,r,Dm,w,Ul"))]
"TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT"
{
if (which_alternative == 3 || which_alternative == 6)

View File

@@ -74,6 +74,11 @@
{
if (!REG_P (operands[0]))
operands[1] = force_reg (E_V8HFmode, operands[1]);
else if (TARGET_HAVE_MVE_FLOAT && CONSTANT_P (operands[1]))
{
operands[1] = neon_make_constant (operands[1]);
gcc_assert (operands[1] != NULL_RTX);
}
}
})