Fix FMA4 and XOP insns.

2009-12-02  Sebastian Pop  <sebastian.pop@amd.com>
	    Richard Henderson  <rth@redhat.com>

	* config/i386/i386-protos.h (ix86_fma4_valid_op_p): Removed.
	* config/i386/i386.c (ix86_fma4_valid_op_p): Removed.
	* config/i386/i386.md: Do not use ix86_fma4_valid_op_p.
	* config/i386/sse.md (fma4_*): Remove alternative with operand 1
	matching a memory access.  Do not use ix86_fma4_valid_op_p.
	(xop_*): Same.
	Do not use ix86_fma4_valid_op_p in FMA4 and XOP splitters.

Co-Authored-By: Richard Henderson <rth@redhat.com>

From-SVN: r154970
This commit is contained in:
Sebastian Pop 2009-12-04 05:27:39 +00:00 committed by Sebastian Pop
parent aa356b75ed
commit 4926bb1d60
5 changed files with 385 additions and 625 deletions

View File

@ -1,3 +1,14 @@
2009-12-02 Sebastian Pop <sebastian.pop@amd.com>
Richard Henderson <rth@redhat.com>
* config/i386/i386-protos.h (ix86_fma4_valid_op_p): Removed.
* config/i386/i386.c (ix86_fma4_valid_op_p): Removed.
* config/i386/i386.md: Do not use ix86_fma4_valid_op_p.
* config/i386/sse.md (fma4_*): Remove alternative with operand 1
matching a memory access. Do not use ix86_fma4_valid_op_p.
(xop_*): Same.
Do not use ix86_fma4_valid_op_p in FMA4 and XOP splitters.
2009-12-02 Richard Henderson <rth@redhat.com>
* config/i386/i386.c (ix86_fixup_binary_operands): For FMA4, force

View File

@ -218,8 +218,7 @@ extern void ix86_expand_vector_set (bool, rtx, rtx, int);
extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx);
extern bool ix86_fma4_valid_op_p (rtx [], rtx, int, bool, int, bool);
extern void ix86_expand_fma4_multiple_memory (rtx [], enum machine_mode);
extern bool ix86_expand_fma4_multiple_memory (rtx [], enum machine_mode);
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);

View File

@ -28807,197 +28807,35 @@ ix86_expand_round (rtx operand0, rtx operand1)
emit_move_insn (operand0, res);
}
/* Validate whether a FMA4 instruction is valid or not.
OPERANDS is the array of operands.
NUM is the number of operands.
USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
NUM_MEMORY is the maximum number of memory operands to accept.
NUM_MEMORY less than zero is a special case to allow an operand
of an instruction to be a memory operation.
When COMMUTATIVE is set, operands 1 and 2 can be swapped. */
/* Fixup an FMA4 or XOP instruction that has 2 memory input references
into a form the hardware will allow by using the destination
register to load one of the memory operations. Presently this is
used by the multiply/add routines to allow 2 memory references. */
/* Validate whether a FMA4 or XOP instruction is valid or not.
   OPERANDS is the array of operands; operands[0] is the destination.
   INSN is the instruction; its pattern is inspected only to allow a
   zero constant operand for pcmov.
   NUM is the number of operands.
   USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
   NUM_MEMORY is the maximum number of memory operands to accept.
   A negative NUM_MEMORY means -NUM_MEMORY memory operands are accepted
   and a memory reference in the last operand is not counted.
   When COMMUTATIVE is set, operands 1 and 2 can be swapped.
   Return true if the combination is accepted, false otherwise.  */
bool
ix86_fma4_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num,
		      bool uses_oc0, int num_memory, bool commutative)
{
  int mem_mask;
  int mem_count;
  int i;

  /* Count the number of memory arguments; bit I of MEM_MASK is set when
     operands[I] is a memory operand.  */
  mem_mask = 0;
  mem_count = 0;
  for (i = 0; i < num; i++)
    {
      enum machine_mode mode = GET_MODE (operands[i]);

      if (register_operand (operands[i], mode))
	;
      else if (memory_operand (operands[i], mode))
	{
	  mem_mask |= (1 << i);
	  mem_count++;
	}
      else
	{
	  rtx pattern = PATTERN (insn);

	  /* allow 0 for pcmov */
	  if (GET_CODE (pattern) != SET
	      || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE
	      || i < 2
	      || operands[i] != CONST0_RTX (mode))
	    return false;
	}
    }

  /* Special case pmacsdq{l,h} where we allow the 3rd argument to be
     a memory operation.  */
  if (num_memory < 0)
    {
      num_memory = -num_memory;
      if ((mem_mask & (1 << (num - 1))) != 0)
	{
	  mem_mask &= ~(1 << (num - 1));
	  mem_count--;
	}
    }

  /* If there were no memory operations, allow the insn */
  if (mem_mask == 0)
    return true;

  /* Do not allow the destination register to be a memory operand.  */
  else if (mem_mask & (1 << 0))
    return false;

  /* If there are too many memory operations, disallow the instruction.  While
     the hardware only allows 1 memory reference, before register allocation
     for some insns, we allow two memory operations sometimes in order to allow
     code like the following to be optimized:

	float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; }

     or similar cases that are vectorized into using the vfmaddss
     instruction.  */
  else if (mem_count > num_memory)
    return false;

  /* Don't allow more than one memory operation if not optimizing.  */
  else if (mem_count > 1 && !optimize)
    return false;

  else if (num == 4 && mem_count == 1)
    {
      /* formats (destination is the first argument), example vfmaddss:
	 xmm1, xmm1, xmm2, xmm3/mem
	 xmm1, xmm1, xmm2/mem, xmm3
	 xmm1, xmm2, xmm3/mem, xmm1
	 xmm1, xmm2/mem, xmm3, xmm1 */
      if (uses_oc0)
	return ((mem_mask == (1 << 1))
		|| (mem_mask == (1 << 2))
		|| (mem_mask == (1 << 3)));

      /* format, example vpmacsdd:
	 xmm1, xmm2, xmm3/mem, xmm1 */
      if (commutative)
	return (mem_mask == (1 << 2) || mem_mask == (1 << 1));
      else
	return (mem_mask == (1 << 2));
    }

  else if (num == 4 && num_memory == 2)
    {
      /* If there are two memory operations, we can load one of the memory ops
	 into the destination register.  This is for optimizing the
	 multiply/add ops, which the combiner has optimized both the multiply
	 and the add insns to have a memory operation.  We have to be careful
	 that the destination doesn't overlap with the inputs.  */
      rtx op0 = operands[0];

      if (reg_mentioned_p (op0, operands[1])
	  || reg_mentioned_p (op0, operands[2])
	  || reg_mentioned_p (op0, operands[3]))
	return false;

      /* formats (destination is the first argument), example vfmaddss:
	 xmm1, xmm1, xmm2, xmm3/mem
	 xmm1, xmm1, xmm2/mem, xmm3
	 xmm1, xmm2, xmm3/mem, xmm1
	 xmm1, xmm2/mem, xmm3, xmm1

	 For the oc0 case, we will load either operands[1] or operands[3] into
	 operands[0], so any combination of 2 memory operands is ok.  */
      if (uses_oc0)
	return true;

      /* format, example vpmacsdd:
	 xmm1, xmm2, xmm3/mem, xmm1

	 For the integer multiply/add instructions be more restrictive and
	 require operands[2] and operands[3] to be the memory operands.  */
      if (commutative)
	/* Both masks must be compared against MEM_MASK; writing the second
	   one as a bare non-zero constant would make the test always true.  */
	return (mem_mask == ((1 << 1) | (1 << 3))
		|| mem_mask == ((1 << 2) | (1 << 3)));
      else
	return (mem_mask == ((1 << 2) | (1 << 3)));
    }

  else if (num == 3 && num_memory == 1)
    {
      /* formats, example vprotb:
	 xmm1, xmm2, xmm3/mem
	 xmm1, xmm2/mem, xmm3 */
      if (uses_oc0)
	return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2)));

      /* format, example vpcomeq:
	 xmm1, xmm2, xmm3/mem */
      else
	return (mem_mask == (1 << 2));
    }

  else
    gcc_unreachable ();

  return false;
}
/* Fixup an FMA4 instruction that has 2 memory input references into a form the
hardware will allow by using the destination register to load one of the
memory operations. Presently this is used by the multiply/add routines to
allow 2 memory references. */
/* NOTE(review): the text below does not read as a single coherent
   function.  It is declared `void' yet contains `return false;' and
   `return true;', and there are statements after a bare `return;' that
   cannot execute.  Presumably two versions of the function were merged
   when diff markers were lost -- TODO confirm against the real commit
   before relying on any of this logic.  */
void
ix86_expand_fma4_multiple_memory (rtx operands[],
enum machine_mode mode)
{
rtx op0 = operands[0];
rtx scratch = operands[0];
if (memory_operand (op0, mode)
|| reg_mentioned_p (op0, operands[1])
|| reg_mentioned_p (op0, operands[2])
|| reg_mentioned_p (op0, operands[3]))
gcc_unreachable ();
gcc_assert (register_operand (operands[0], mode));
gcc_assert (register_operand (operands[1], mode));
gcc_assert (MEM_P (operands[2]) && MEM_P (operands[3]));
/* For 2 memory operands, pick either operands[1] or operands[3] to move into
the destination register. */
if (memory_operand (operands[1], mode))
if (reg_mentioned_p (scratch, operands[1]))
{
emit_move_insn (op0, operands[1]);
operands[1] = op0;
}
else if (memory_operand (operands[3], mode))
{
emit_move_insn (op0, operands[3]);
operands[3] = op0;
if (!can_create_pseudo_p ())
return false;
scratch = gen_reg_rtx (mode);
}
emit_move_insn (scratch, operands[3]);
if (rtx_equal_p (operands[2], operands[3]))
operands[2] = operands[3] = scratch;
else
gcc_unreachable ();
/* NOTE(review): everything after this bare return is unreachable as written.  */
return;
operands[3] = scratch;
return true;
}
/* Table of valid machine attributes. */

View File

@ -19248,7 +19248,7 @@
(match_operand:MODEF 1 "register_operand" "x")
(match_operand:MODEF 2 "register_operand" "x")
(match_operand:MODEF 3 "register_operand" "x")))]
"TARGET_XOP && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
"TARGET_XOP"
"vpcmov\t{%1, %3, %2, %0|%0, %2, %3, %1}"
[(set_attr "type" "sse4arg")])

File diff suppressed because it is too large Load Diff