Implement vec_perm broadcast, and tidy lots of patterns to help.

From-SVN: r154836
Richard Henderson 2009-11-30 10:26:55 -08:00 committed by Richard Henderson
parent 9fda11a2ec
commit 5e04b3b694
5 changed files with 637 additions and 308 deletions

gcc/ChangeLog

@@ -1,3 +1,50 @@
2009-11-30 Richard Henderson <rth@redhat.com>
* config/i386/i386.c (ix86_vec_interleave_v2df_operator_ok): New.
(bdesc_special_args): Update insn codes.
(avx_vpermilp_parallel): Correct range check.
(ix86_rtx_costs): Handle vector permutation rtx codes.
(struct expand_vec_perm_d): Move earlier.
(get_mode_wider_vector): New.
(expand_vec_perm_broadcast_1): New.
(ix86_expand_vector_init_duplicate): Use it. Tidy AVX modes.
(expand_vec_perm_broadcast): New.
(ix86_expand_vec_perm_builtin_1): Use it.
* config/i386/i386-protos.h: Update.
* config/i386/predicates.md (avx_vbroadcast_operand): New.
* config/i386/sse.md (AVX256MODE24P): New.
(ssescalarmodesuffix2s): New.
(avxhalfvecmode, avxscalarmode): Fill out to all modes.
(avxmodesuffixf2c): Add V8SI, V4DI.
(vec_dupv4sf): New expander.
(*vec_dupv4sf_avx): Add vbroadcastss alternative.
(*vec_set<mode>_0_avx, *vec_set<mode>_0_sse4_1): Macro-ize for
V4SF and V4SI. Move C alternatives to front. Add insertps and
pinsrd alternatives.
(*vec_set<mode>_0_sse2): Split out from ...
(vec_set<mode>_0): Macro-ize for V4SF and V4SI.
(vec_interleave_highv2df, vec_interleave_lowv2df): Require register
destination; use ix86_vec_interleave_v2df_operator_ok, instead of
ix86_fixup_binary_operands.
(*avx_interleave_highv2df, avx_interleave_lowv2df): Add movddup.
(*sse3_interleave_highv2df, sse3_interleave_lowv2df): New.
(*avx_movddup, *sse3_movddup): Remove. New splitter from
vec_select form to vec_duplicate form.
(*sse2_interleave_highv2df, sse2_interleave_lowv2df): Use
ix86_vec_interleave_v2df_operator_ok.
(avx_movddup256, avx_unpcklpd256): Change to expanders, merge into ...
(*avx_unpcklpd256): ... here.
(*vec_dupv4si_avx): New.
(*vec_dupv2di_avx): Add movddup alternative.
(*vec_dupv2di_sse3): New.
(vec_dup<AVX256MODE24P>): Replace avx_vbroadcasts<AVXMODEF4P> and
avx_vbroadcastss256; represent with vec_duplicate instead of
nested vec_concat operations.
(avx_vbroadcastf128_<mode>): Rename from
avx_vbroadcastf128_p<avxmodesuffixf2c>256.
(*avx_vperm_broadcast_v4sf): New.
(*avx_vperm_broadcast_<AVX256MODEF2P>): New.
2009-11-30 Martin Jambor <mjambor@suse.cz>
PR middle-end/42196

gcc/config/i386/i386-protos.h

@@ -86,6 +86,7 @@ extern void ix86_expand_binary_operator (enum rtx_code,
enum machine_mode, rtx[]);
extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
extern bool ix86_lea_for_add_ok (enum rtx_code, rtx, rtx[]);
extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high);
extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,

gcc/config/i386/i386.c

@@ -13849,6 +13849,19 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
return TRUE;
}
/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
are ok, keeping in mind the possible movddup alternative. */
bool
ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
{
if (MEM_P (operands[0]))
return rtx_equal_p (operands[0], operands[1 + high]);
if (MEM_P (operands[1]) && MEM_P (operands[2]))
return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
return true;
}
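As a rough intrinsics-level sketch (not part of the commit; helper names are illustrative), the cases the predicate admits each map onto a single instruction: a register destination is always representable (unpckhpd/unpcklpd), two equal memory inputs are representable on SSE3 as movddup, and a memory destination works only when it equals the input whose element it preserves (a movhpd/movlpd store).

    #include <emmintrin.h>   /* SSE2 */
    #include <pmmintrin.h>   /* SSE3 */

    /* High interleave, register destination: unpckhpd.  */
    static __m128d interleave_high (__m128d a, __m128d b)
    { return _mm_unpackhi_pd (a, b); }      /* { a[1], b[1] } */

    /* Low interleave with both inputs the same memory word,
       SSE3 only: movddup.  */
    static __m128d interleave_low_mem (const double *p)
    { return _mm_loaddup_pd (p); }          /* { p[0], p[0] } */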
/* Post-reload splitter for converting an SF or DFmode value in an
SSE register into an unsigned SImode. */
@@ -21480,11 +21493,11 @@ static const struct builtin_description bdesc_special_args[] =
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
@@ -24597,7 +24610,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
if (!CONST_INT_P (er))
return 0;
ei = INTVAL (er);
if (ei >= nelt)
if (ei >= 2 * nelt)
return 0;
ipar[i] = ei;
}
@@ -25713,6 +25726,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
*total = 0;
return false;
case VEC_SELECT:
case VEC_CONCAT:
case VEC_MERGE:
case VEC_DUPLICATE:
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
same cost. */
*total = COSTS_N_INSNS (1);
return true;
default:
return false;
}
@@ -26547,16 +26570,43 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
/* AVX does not support 32-byte integer vector operations,
thus the longest vector we are faced with is V16QImode. */
#define MAX_VECT_LEN 16
struct expand_vec_perm_d
{
rtx target, op0, op1;
unsigned char perm[MAX_VECT_LEN];
enum machine_mode vmode;
unsigned char nelt;
bool testing_p;
};
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
/* Get a vector mode of the same size as the original but with elements
twice as wide. This is only guaranteed to apply to integral vectors. */
static inline enum machine_mode
get_mode_wider_vector (enum machine_mode o)
{
/* ??? Rely on the ordering that genmodes.c gives to vectors. */
enum machine_mode n = GET_MODE_WIDER_MODE (o);
gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
return n;
}
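For instance, walking GCC's mode chain under the genmodes.c ordering assumption noted in the comment:

    /* Illustrative values, assuming the standard vector mode ordering:
       get_mode_wider_vector (V16QImode) => V8HImode
       get_mode_wider_vector (V8HImode)  => V4SImode  */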
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
/* ??? Call into the vec_perm support to implement the broadcast. */
static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
rtx target, rtx val)
{
enum machine_mode hmode, smode, wsmode, wvmode;
rtx x;
bool ok;
switch (mode)
{
@@ -26566,13 +26616,28 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
return false;
/* FALLTHRU */
case V4DFmode:
case V4DImode:
case V8SFmode:
case V8SImode:
case V2DFmode:
case V2DImode:
case V4SFmode:
case V4SImode:
val = force_reg (GET_MODE_INNER (mode), val);
x = gen_rtx_VEC_DUPLICATE (mode, val);
emit_insn (gen_rtx_SET (VOIDmode, target, x));
{
rtx insn, dup;
/* First attempt to recognize VAL as-is. */
dup = gen_rtx_VEC_DUPLICATE (mode, val);
insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
if (recog_memoized (insn) < 0)
{
/* If that fails, force VAL into a register. */
XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
ok = recog_memoized (insn) >= 0;
gcc_assert (ok);
}
}
return true;
case V4HImode:
@@ -26580,130 +26645,87 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
return false;
if (TARGET_SSE || TARGET_3DNOW_A)
{
rtx x;
val = gen_lowpart (SImode, val);
x = gen_rtx_TRUNCATE (HImode, val);
x = gen_rtx_VEC_DUPLICATE (mode, x);
emit_insn (gen_rtx_SET (VOIDmode, target, x));
return true;
}
else
{
smode = HImode;
wsmode = SImode;
wvmode = V2SImode;
goto widen;
}
goto widen;
case V8QImode:
if (!mmx_ok)
return false;
smode = QImode;
wsmode = HImode;
wvmode = V4HImode;
goto widen;
case V8HImode:
if (TARGET_SSE2)
{
struct expand_vec_perm_d dperm;
rtx tmp1, tmp2;
/* Extend HImode to SImode using a paradoxical SUBREG. */
permute:
memset (&dperm, 0, sizeof (dperm));
dperm.target = target;
dperm.vmode = mode;
dperm.nelt = GET_MODE_NUNITS (mode);
dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
/* Extend to SImode using a paradoxical SUBREG. */
tmp1 = gen_reg_rtx (SImode);
emit_move_insn (tmp1, gen_lowpart (SImode, val));
/* Insert the SImode value as low element of V4SImode vector. */
tmp2 = gen_reg_rtx (V4SImode);
tmp1 = gen_rtx_VEC_MERGE (V4SImode,
gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
CONST0_RTX (V4SImode),
const1_rtx);
emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
/* Cast the V4SImode vector back to a V8HImode vector. */
tmp1 = gen_reg_rtx (V8HImode);
emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
/* Duplicate the low short through the whole low SImode word. */
emit_insn (gen_vec_interleave_lowv8hi (tmp1, tmp1, tmp1));
/* Cast the V8HImode vector back to a V4SImode vector. */
tmp2 = gen_reg_rtx (V4SImode);
emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
/* Replicate the low element of the V4SImode vector. */
emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
/* Cast the V2SImode back to V8HImode, and store in target. */
emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
return true;
/* Insert the SImode value as low element of a V4SImode vector. */
tmp2 = gen_lowpart (V4SImode, dperm.op0);
emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
ok = (expand_vec_perm_1 (&dperm)
|| expand_vec_perm_broadcast_1 (&dperm));
gcc_assert (ok);
return ok;
}
smode = HImode;
wsmode = SImode;
wvmode = V4SImode;
goto widen;
case V16QImode:
if (TARGET_SSE2)
{
rtx tmp1, tmp2;
/* Extend QImode to SImode using a paradoxical SUBREG. */
tmp1 = gen_reg_rtx (SImode);
emit_move_insn (tmp1, gen_lowpart (SImode, val));
/* Insert the SImode value as low element of V4SImode vector. */
tmp2 = gen_reg_rtx (V4SImode);
tmp1 = gen_rtx_VEC_MERGE (V4SImode,
gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
CONST0_RTX (V4SImode),
const1_rtx);
emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
/* Cast the V4SImode vector back to a V16QImode vector. */
tmp1 = gen_reg_rtx (V16QImode);
emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
/* Duplicate the low byte through the whole low SImode word. */
emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
/* Cast the V16QImode vector back to a V4SImode vector. */
tmp2 = gen_reg_rtx (V4SImode);
emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
/* Replicate the low element of the V4SImode vector. */
emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
/* Cast the V2SImode back to V16QImode, and store in target. */
emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
return true;
}
smode = QImode;
wsmode = HImode;
wvmode = V8HImode;
goto permute;
goto widen;
widen:
/* Replicate the value once into the next wider mode and recurse. */
val = convert_modes (wsmode, smode, val, true);
x = expand_simple_binop (wsmode, ASHIFT, val,
GEN_INT (GET_MODE_BITSIZE (smode)),
NULL_RTX, 1, OPTAB_LIB_WIDEN);
val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
x = gen_reg_rtx (wvmode);
if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
gcc_unreachable ();
emit_move_insn (target, gen_lowpart (mode, x));
return true;
case V4DFmode:
hmode = V2DFmode;
goto half;
case V4DImode:
hmode = V2DImode;
goto half;
case V8SFmode:
hmode = V4SFmode;
goto half;
case V8SImode:
hmode = V4SImode;
goto half;
case V16HImode:
hmode = V8HImode;
goto half;
case V32QImode:
hmode = V16QImode;
goto half;
half:
{
rtx tmp = gen_reg_rtx (hmode);
ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val);
emit_insn (gen_rtx_SET (VOIDmode, target,
gen_rtx_VEC_CONCAT (mode, tmp, tmp)));
enum machine_mode smode, wsmode, wvmode;
rtx x;
smode = GET_MODE_INNER (mode);
wvmode = get_mode_wider_vector (mode);
wsmode = GET_MODE_INNER (wvmode);
val = convert_modes (wsmode, smode, val, true);
x = expand_simple_binop (wsmode, ASHIFT, val,
GEN_INT (GET_MODE_BITSIZE (smode)),
NULL_RTX, 1, OPTAB_LIB_WIDEN);
val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
x = gen_lowpart (wvmode, target);
ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
gcc_assert (ok);
return ok;
}
case V16HImode:
case V32QImode:
{
enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
rtx x = gen_reg_rtx (hvmode);
ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
gcc_assert (ok);
x = gen_rtx_VEC_CONCAT (mode, x, x);
emit_insn (gen_rtx_SET (VOIDmode, target, x));
}
return true;
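The widen path above can be sketched in plain C (hypothetical helper, not from the commit): each step shifts the value up by its own width and ORs in the original, yielding two copies in a scalar twice as wide, after which the duplicate is retried in the wider vector mode.

    /* QImode -> HImode -> SImode widening chain, as scalar arithmetic.  */
    static unsigned int
    widen_dup (unsigned char v)
    {
      unsigned short h = (unsigned short) ((v << 8) | v);  /* two copies  */
      return ((unsigned int) h << 16) | h;                 /* four copies */
    }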
@@ -29085,19 +29107,6 @@ ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
return ix86_builtins[(int) fcode];
}
/* AVX does not support 32-byte integer vector operations,
thus the longest vector we are faced with is V16QImode. */
#define MAX_VECT_LEN 16
struct expand_vec_perm_d
{
rtx target, op0, op1;
unsigned char perm[MAX_VECT_LEN];
enum machine_mode vmode;
unsigned char nelt;
bool testing_p;
};
/* Return a vector mode with twice as many elements as VMODE. */
/* ??? Consider moving this to a table generated by genmodes.c. */
@@ -29739,8 +29748,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
extract-even and extract-odd permutations. */
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
@@ -29855,6 +29864,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
extract-even and extract-odd permutations. */
static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
@@ -29871,6 +29883,84 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
permutations. We assume that expand_vec_perm_1 has already failed. */
static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
enum machine_mode vmode = d->vmode;
unsigned char perm2[4];
rtx op0 = d->op0;
bool ok;
switch (vmode)
{
case V4DFmode:
case V8SFmode:
/* These are special-cased in sse.md so that we can optionally
use the vbroadcast instruction. They expand to two insns
if the input happens to be in a register. */
gcc_unreachable ();
case V2DFmode:
case V2DImode:
case V4SFmode:
case V4SImode:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
case V8HImode:
case V16QImode:
/* These can be implemented via interleave. We save one insn by
stopping once we have promoted to V4SImode and then use pshufd. */
do
{
optab otab = vec_interleave_low_optab;
if (elt >= nelt2)
{
otab = vec_interleave_high_optab;
elt -= nelt2;
}
nelt2 /= 2;
op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
vmode = get_mode_wider_vector (vmode);
op0 = gen_lowpart (vmode, op0);
}
while (vmode != V4SImode);
memset (perm2, elt, 4);
ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
gcc_assert (ok);
return true;
default:
gcc_unreachable ();
}
}
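A minimal intrinsics sketch of the V16QImode strategy (assuming element 0; the helper name is illustrative, not from the commit): two self-interleaves promote the byte to a 32-bit element, and the final pshufd replicates that dword — saving one interleave compared with promoting all the way.

    #include <emmintrin.h>

    static __m128i
    broadcast_byte0 (__m128i x)
    {
      x = _mm_unpacklo_epi8 (x, x);        /* low bytes doubled           */
      x = _mm_unpacklo_epi16 (x, x);       /* low 16-bit units doubled    */
      return _mm_shuffle_epi32 (x, 0x00);  /* pshufd: replicate dword 0   */
    }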
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
broadcast permutations. */
static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
unsigned i, elt, nelt = d->nelt;
if (d->op0 != d->op1)
return false;
elt = d->perm[0];
for (i = 1; i < nelt; ++i)
if (d->perm[i] != elt)
return false;
return expand_vec_perm_broadcast_1 (d);
}
/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
With all of the interface bits taken care of, perform the expansion
in D and return true on success. */
@@ -29878,8 +29968,7 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
static bool
ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
{
/* First things first -- check if the instruction is implementable
with a single instruction. */
/* Try a single instruction expansion. */
if (expand_vec_perm_1 (d))
return true;
@@ -29894,13 +29983,16 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_interleave2 (d))
return true;
if (expand_vec_perm_broadcast (d))
return true;
/* Try sequences of three instructions. */
if (expand_vec_perm_pshufb2 (d))
return true;
/* ??? Look for narrow permutations whose element orderings would
allow the promition to a wider mode.  */
allow the promotion to a wider mode. */
/* ??? Look for sequences of interleave or a wider permute that place
the data into the correct lanes for a half-vector shuffle like
@@ -29912,8 +30004,6 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_even_odd (d))
return true;
/* ??? Pattern match broadcast. */
return false;
}

gcc/config/i386/predicates.md

@@ -1241,3 +1241,20 @@
(define_predicate "avx_vperm2f128_v4df_operand"
(and (match_code "parallel")
(match_test "avx_vperm2f128_parallel (op, V4DFmode)")))
;; Return 1 if OP is a parallel for a vbroadcast permute.
(define_predicate "avx_vbroadcast_operand"
(and (match_code "parallel")
(match_code "const_int" "a"))
{
rtx elt = XVECEXP (op, 0, 0);
int i, nelt = XVECLEN (op, 0);
/* Don't bother checking there are the right number of operands,
merely that they're all identical. */
for (i = 1; i < nelt; ++i)
if (XVECEXP (op, 0, i) != elt)
return false;
return true;
})
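For example (illustrative RTL, not from the commit), the selector of a V4SF broadcast of element 2 matches this predicate:

;; (parallel [(const_int 2) (const_int 2) (const_int 2) (const_int 2)])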

gcc/config/i386/sse.md

@@ -54,6 +54,7 @@
(define_mode_iterator AVX256MODEF2P [V8SF V4DF])
(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
(define_mode_iterator AVX256MODE24P [V8SI V8SF V4DI V4DF])
(define_mode_iterator AVX256MODE4P [V4DI V4DF])
(define_mode_iterator AVX256MODE8P [V8SI V8SF])
(define_mode_iterator AVXMODEF2P [V4SF V2DF V8SF V4DF])
@@ -96,6 +97,8 @@
(define_mode_attr ssemodesuffixf2c [(V4SF "s") (V2DF "d")])
(define_mode_attr ssescalarmodesuffix2s [(V4SF "ss") (V4SI "d")])
;; Mapping of the max integer size for xop rotate immediate constraint
(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
@@ -125,17 +128,18 @@
[(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF")
(V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")])
(define_mode_attr avxhalfvecmode
[(V4SF "V2SF") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI")
(V4DI "V2DI") (V8SF "V4SF") (V4DF "V2DF")])
[(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
(V8SF "V4SF") (V4DF "V2DF")
(V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V4SF "V2SF")])
(define_mode_attr avxscalarmode
[(V16QI "QI") (V8HI "HI") (V4SI "SI") (V4SF "SF") (V2DF "DF")
(V8SF "SF") (V4DF "DF")])
[(V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI") (V4SF "SF") (V2DF "DF")
(V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") (V8SF "SF") (V4DF "DF")])
(define_mode_attr avxcvtvecmode
[(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")])
(define_mode_attr avxpermvecmode
[(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")])
(define_mode_attr avxmodesuffixf2c
[(V4SF "s") (V2DF "d") (V8SF "s") (V4DF "d")])
[(V4SF "s") (V2DF "d") (V8SI "s") (V8SF "s") (V4DI "d") (V4DF "d")])
(define_mode_attr avxmodesuffixp
[(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si")
(V4DF "pd")])
@@ -4012,14 +4016,27 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "SF")])
(define_insn "*vec_dupv4sf_avx"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(define_expand "vec_dupv4sf"
[(set (match_operand:V4SF 0 "register_operand" "")
(vec_duplicate:V4SF
(match_operand:SF 1 "register_operand" "x")))]
(match_operand:SF 1 "nonimmediate_operand" "")))]
"TARGET_SSE"
{
if (!TARGET_AVX)
operands[1] = force_reg (V4SFmode, operands[1]);
})
(define_insn "*vec_dupv4sf_avx"
[(set (match_operand:V4SF 0 "register_operand" "=x,x")
(vec_duplicate:V4SF
(match_operand:SF 1 "nonimmediate_operand" "x,m")))]
"TARGET_AVX"
"vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}"
[(set_attr "type" "sselog1")
(set_attr "length_immediate" "1")
"@
vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}
vbroadcastss\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1,ssemov")
(set_attr "length_immediate" "1,0")
(set_attr "prefix_extra" "0,1")
(set_attr "prefix" "vex")
(set_attr "mode" "V4SF")])
@@ -4125,35 +4142,78 @@
DONE;
})
(define_insn "*vec_setv4sf_0_avx"
[(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,m")
(vec_merge:V4SF
(vec_duplicate:V4SF
(match_operand:SF 2 "general_operand" " x,m,*r,x*rfF"))
(match_operand:V4SF 1 "vector_move_operand" " x,C,C ,0")
(define_insn "*vec_set<mode>_0_avx"
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:<ssescalarmode> 2
"general_operand" " x,m,*r,x,*rm,x*rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,x, x,0")
(const_int 1)))]
"TARGET_AVX"
"@
vmovss\t{%2, %1, %0|%0, %1, %2}
vmovss\t{%2, %0|%0, %2}
vinsertps\t{$0xe, %2, %2, %0|%0, %2, %2, 0xe}
vmov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
vmovd\t{%2, %0|%0, %2}
vmovss\t{%2, %1, %0|%0, %1, %2}
vpinsrd\t{$0, %2, %1, %0|%0, %1, %2, 0}
#"
[(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*")
(set_attr "prefix_extra" "*,*,*,*,1,*")
(set_attr "length_immediate" "*,*,*,*,1,*")
(set_attr "prefix" "vex")
(set_attr "mode" "SF,<ssescalarmode>,SI,SF,TI,*")])
(define_insn "*vec_set<mode>_0_sse4_1"
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:<ssescalarmode> 2
"general_operand" " x,m,*r,x,*rm,*rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,0, 0,0")
(const_int 1)))]
"TARGET_SSE4_1"
"@
insertps\t{$0xe, %2, %0|%0, %2, 0xe}
mov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
movd\t{%2, %0|%0, %2}
movss\t{%2, %0|%0, %2}
pinsrd\t{$0, %2, %0|%0, %2, 0}
#"
[(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*")
(set_attr "prefix_extra" "*,*,*,*,1,*")
(set_attr "length_immediate" "*,*,*,*,1,*")
(set_attr "mode" "SF,<ssescalarmode>,SI,SF,TI,*")])
(define_insn "*vec_set<mode>_0_sse2"
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x, x,x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:<ssescalarmode> 2
"general_operand" " m,*r,x,x*rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C, C,0,0")
(const_int 1)))]
"TARGET_SSE2"
"@
mov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
movd\t{%2, %0|%0, %2}
movss\t{%2, %0|%0, %2}
#"
[(set_attr "type" "ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "SF")])
(set_attr "mode" "<ssescalarmode>,SI,SF,*")])
(define_insn "vec_setv4sf_0"
[(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m")
(vec_merge:V4SF
(vec_duplicate:V4SF
(match_operand:SF 2 "general_operand" " x,m,*r,x*rfF"))
(match_operand:V4SF 1 "vector_move_operand" " 0,C,C ,0")
(define_insn "vec_set<mode>_0"
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:<ssescalarmode> 2
"general_operand" " m,x,x*rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,0,0")
(const_int 1)))]
"TARGET_SSE"
"@
movss\t{%2, %0|%0, %2}
movss\t{%2, %0|%0, %2}
movd\t{%2, %0|%0, %2}
#"
[(set_attr "type" "ssemov")
(set_attr "mode" "SF")])
@@ -4484,7 +4544,7 @@
(set_attr "mode" "V4DF")])
(define_expand "vec_interleave_highv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "")
[(set (match_operand:V2DF 0 "register_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "")
@@ -4492,24 +4552,46 @@
(parallel [(const_int 1)
(const_int 3)])))]
"TARGET_SSE2"
"ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
{
if (!ix86_vec_interleave_v2df_operator_ok (operands, 1))
operands[2] = force_reg (V2DFmode, operands[2]);
})
(define_insn "*avx_interleave_highv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " x,o,x")
(match_operand:V2DF 2 "nonimmediate_operand" " x,x,0"))
(match_operand:V2DF 1 "nonimmediate_operand" " x,o,o,x")
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,x,0"))
(parallel [(const_int 1)
(const_int 3)])))]
"TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
"@
vunpckhpd\t{%2, %1, %0|%0, %1, %2}
vmovddup\t{%H1, %0|%0, %H1}
vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
vmovhpd\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog,ssemov,ssemov")
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "*sse3_interleave_highv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " 0,o,o,x")
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,0,0"))
(parallel [(const_int 1)
(const_int 3)])))]
"TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
"@
unpckhpd\t{%2, %0|%0, %2}
movddup\t{%H1, %0|%0, %H1}
movlpd\t{%H1, %0|%0, %H1}
movhpd\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix_data16" "*,*,1,1")
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "*sse2_interleave_highv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
@@ -4519,7 +4601,7 @@
(match_operand:V2DF 2 "nonimmediate_operand" " x,0,0"))
(parallel [(const_int 1)
(const_int 3)])))]
"TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
"@
unpckhpd\t{%2, %0|%0, %2}
movlpd\t{%H1, %0|%0, %H1}
@@ -4528,50 +4610,112 @@
(set_attr "prefix_data16" "*,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(define_insn "avx_movddup256"
[(set (match_operand:V4DF 0 "register_operand" "=x")
;; Recall that the 256-bit unpck insns only shuffle within their lanes.
(define_expand "avx_movddup256"
[(set (match_operand:V4DF 0 "register_operand" "")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "nonimmediate_operand" "xm")
(match_operand:V4DF 1 "nonimmediate_operand" "")
(match_dup 1))
(parallel [(const_int 0) (const_int 2)
(const_int 4) (const_int 6)])))]
(parallel [(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX"
"vmovddup\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1")
"")
(define_expand "avx_unpcklpd256"
[(set (match_operand:V4DF 0 "register_operand" "")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "register_operand" "")
(match_operand:V4DF 2 "nonimmediate_operand" ""))
(parallel [(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX"
"")
(define_insn "*avx_unpcklpd256"
[(set (match_operand:V4DF 0 "register_operand" "=x,x")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "nonimmediate_operand" "xm,x")
(match_operand:V4DF 2 "nonimmediate_operand" " 1,xm"))
(parallel [(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX
&& (!MEM_P (operands[1]) || rtx_equal_p (operands[1], operands[2]))"
"@
vmovddup\t{%1, %0|%0, %1}
vunpcklpd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "V4DF")])
(define_insn "*avx_movddup"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
(define_expand "vec_interleave_lowv2df"
[(set (match_operand:V2DF 0 "register_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
(match_dup 1))
(match_operand:V2DF 1 "nonimmediate_operand" "")
(match_operand:V2DF 2 "nonimmediate_operand" ""))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"@
vmovddup\t{%1, %0|%0, %1}
#"
[(set_attr "type" "sselog1,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF")])
"TARGET_SSE2"
{
if (!ix86_vec_interleave_v2df_operator_ok (operands, 0))
operands[1] = force_reg (V2DFmode, operands[1]);
})
(define_insn "*sse3_movddup"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
(define_insn "*avx_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
(match_dup 1))
(match_operand:V2DF 1 "nonimmediate_operand" " x,m,x,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_SSE3 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
"@
vunpcklpd\t{%2, %1, %0|%0, %1, %2}
vmovddup\t{%1, %0|%0, %1}
vmovhpd\t{%2, %1, %0|%0, %1, %2}
vmovlpd\t{%2, %H0|%H0, %2}"
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "*sse3_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " 0,m,0,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
"@
unpcklpd\t{%2, %0|%0, %2}
movddup\t{%1, %0|%0, %1}
#"
[(set_attr "type" "sselog1,ssemov")
(set_attr "mode" "V2DF")])
movhpd\t{%2, %0|%0, %2}
movlpd\t{%2, %H0|%H0, %2}"
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix_data16" "*,*,1,1")
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "*sse2_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
"@
unpcklpd\t{%2, %0|%0, %2}
movhpd\t{%2, %0|%0, %2}
movlpd\t{%2, %H0|%H0, %2}"
[(set_attr "type" "sselog,ssemov,ssemov")
(set_attr "prefix_data16" "*,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(define_split
[(set (match_operand:V2DF 0 "memory_operand" "")
@@ -4590,65 +4734,19 @@
DONE;
})
;; Recall that the 256-bit unpck insns only shuffle within their lanes.
(define_insn "avx_unpcklpd256"
[(set (match_operand:V4DF 0 "register_operand" "=x")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "register_operand" "x")
(match_operand:V4DF 2 "nonimmediate_operand" "xm"))
(parallel [(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX"
"vunpcklpd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "V4DF")])
(define_expand "vec_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "")
(define_split
[(set (match_operand:V2DF 0 "register_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "")
(match_operand:V2DF 2 "nonimmediate_operand" ""))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_SSE2"
"ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
(define_insn "*avx_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " x,x,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"@
vunpcklpd\t{%2, %1, %0|%0, %1, %2}
vmovhpd\t{%2, %1, %0|%0, %1, %2}
vmovlpd\t{%2, %H0|%H0, %2}"
[(set_attr "type" "sselog,ssemov,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(define_insn "*sse2_interleave_lowv2df"
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
(parallel [(const_int 0)
(const_int 2)])))]
"TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"@
unpcklpd\t{%2, %0|%0, %2}
movhpd\t{%2, %0|%0, %2}
movlpd\t{%2, %H0|%H0, %2}"
[(set_attr "type" "sselog,ssemov,ssemov")
(set_attr "prefix_data16" "*,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(match_operand:V2DF 1 "memory_operand" "")
(match_dup 1))
(parallel [(match_operand:SI 2 "const_0_to_1_operand" "")
(match_operand:SI 3 "const_int_operand" "")])))]
"TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])"
[(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))]
{
operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
})
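A sketch of the payoff (hypothetical helper): broadcasting either element of a V2DF in memory collapses to one movddup from the element's own address, which is what the adjust_address above computes.

    #include <pmmintrin.h>   /* SSE3 */

    static __m128d
    dup_mem_elt (const double *p, int elt)   /* elt is 0 or 1 */
    { return _mm_loaddup_pd (p + elt); }     /* movddup 8*elt(%reg) */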
(define_expand "avx_shufpd256"
[(match_operand:V4DF 0 "register_operand" "")
@@ -7408,6 +7506,20 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
(define_insn "*vec_dupv4si_avx"
[(set (match_operand:V4SI 0 "register_operand" "=x,x")
(vec_duplicate:V4SI
(match_operand:SI 1 "register_operand" "x,m")))]
"TARGET_AVX"
"@
vpshufd\t{$0, %1, %0|%0, %1, 0}
vbroadcastss\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1,ssemov")
(set_attr "length_immediate" "1,0")
(set_attr "prefix_extra" "0,1")
(set_attr "prefix" "vex")
(set_attr "mode" "TI,V4SF")])
(define_insn "*vec_dupv4si"
[(set (match_operand:V4SI 0 "register_operand" "=Y2,x")
(vec_duplicate:V4SI
@@ -7417,19 +7529,31 @@
%vpshufd\t{$0, %1, %0|%0, %1, 0}
shufps\t{$0, %0, %0|%0, %0, 0}"
[(set_attr "type" "sselog1")
(set_attr "prefix" "maybe_vex,orig")
(set_attr "length_immediate" "1")
(set_attr "mode" "TI,V4SF")])
(define_insn "*vec_dupv2di_avx"
[(set (match_operand:V2DI 0 "register_operand" "=x")
[(set (match_operand:V2DI 0 "register_operand" "=x,x")
(vec_duplicate:V2DI
(match_operand:DI 1 "register_operand" "x")))]
(match_operand:DI 1 "nonimmediate_operand" " x,m")))]
"TARGET_AVX"
"vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}"
"@
vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}
vmovddup\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1")
(set_attr "prefix" "vex")
(set_attr "mode" "TI")])
(set_attr "mode" "TI,DF")])
(define_insn "*vec_dupv2di_sse3"
[(set (match_operand:V2DI 0 "register_operand" "=x,x")
(vec_duplicate:V2DI
(match_operand:DI 1 "nonimmediate_operand" " 0,m")))]
"TARGET_SSE3"
"@
punpcklqdq\t%0, %0
movddup\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1")
(set_attr "mode" "TI,DF")])
(define_insn "*vec_dupv2di"
[(set (match_operand:V2DI 0 "register_operand" "=Y2,x")
@@ -11838,6 +11962,108 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
(define_insn_and_split "vec_dup<mode>"
[(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x")
(vec_duplicate:AVX256MODE24P
(match_operand:<avxscalarmode> 1 "nonimmediate_operand" "m,?x")))]
"TARGET_AVX"
"@
vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}
#"
"&& reload_completed && REG_P (operands[1])"
[(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1)))
(set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))]
{
operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));
}
[(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "V8SF")])
(define_insn "avx_vbroadcastf128_<mode>"
[(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x")
(vec_concat:AVX256MODE
(match_operand:<avxhalfvecmode> 1 "nonimmediate_operand" "m,0,?x")
(match_dup 1)))]
"TARGET_AVX"
"@
vbroadcastf128\t{%1, %0|%0, %1}
vinsertf128\t{$1, %1, %0, %0|%0, %0, %1, 1}
vperm2f128\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}"
[(set_attr "type" "ssemov,sselog1,sselog1")
(set_attr "prefix_extra" "1")
(set_attr "length_immediate" "0,1,1")
(set_attr "prefix" "vex")
(set_attr "mode" "V4SF,V8SF,V8SF")])
;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
;; If it so happens that the input is in memory, use vbroadcast.
;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
(define_insn "*avx_vperm_broadcast_v4sf"
[(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
(vec_select:V4SF
(match_operand:V4SF 1 "nonimmediate_operand" "m,o,x")
(match_parallel 2 "avx_vbroadcast_operand"
[(match_operand 3 "const_int_operand" "C,n,n")])))]
"TARGET_AVX"
{
int elt = INTVAL (operands[3]);
switch (which_alternative)
{
case 0:
case 1:
operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
return "vbroadcastss\t{%1, %0|%0, %1}";
case 2:
operands[2] = GEN_INT (elt * 0x55);
return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
default:
gcc_unreachable ();
}
}
[(set_attr "type" "ssemov,ssemov,sselog1")
(set_attr "prefix_extra" "1")
(set_attr "length_immediate" "0,0,1")
(set_attr "prefix" "vex")
(set_attr "mode" "SF,SF,V4SF")])
(define_insn_and_split "*avx_vperm_broadcast_<mode>"
[(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x,x,x")
(vec_select:AVX256MODEF2P
(match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "m,o,?x")
(match_parallel 2 "avx_vbroadcast_operand"
[(match_operand 3 "const_int_operand" "C,n,n")])))]
"TARGET_AVX"
"#"
"&& reload_completed"
[(set (match_dup 0) (vec_duplicate:AVX256MODEF2P (match_dup 1)))]
{
rtx op0 = operands[0], op1 = operands[1];
int elt = INTVAL (operands[3]);
if (REG_P (op1))
{
int mask;
/* Shuffle element we care about into all elements of the 128-bit lane.
The other lane gets shuffled too, but we don't care. */
if (<MODE>mode == V4DFmode)
mask = (elt & 1 ? 15 : 0);
else
mask = (elt & 3) * 0x55;
emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
/* Shuffle the lane we care about into both lanes of the dest. */
mask = (elt / (<ssescalarnum> / 2)) * 0x11;
emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
DONE;
}
operands[1] = adjust_address_nv (op1, <avxscalarmode>mode,
elt * GET_MODE_SIZE (<avxscalarmode>mode));
})
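And the 256-bit register path, sketched for V8SF element 5 (helper hypothetical): vpermilps with (elt & 3) * 0x55 replicates the element within each lane, then vperm2f128 with (elt / 4) * 0x11 copies the selected lane to both halves.

    #include <immintrin.h>

    static __m256
    bcast_elt5 (__m256 x)
    {
      x = _mm256_permute_ps (x, (5 & 3) * 0x55);            /* imm 0x55 */
      return _mm256_permute2f128_ps (x, x, (5 / 4) * 0x11); /* imm 0x11 */
    }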
(define_expand "avx_vpermil<mode>"
[(set (match_operand:AVXMODEFDP 0 "register_operand" "")
(vec_select:AVXMODEFDP
@@ -11989,58 +12215,6 @@
(set_attr "prefix" "vex")
(set_attr "mode" "V8SF")])
(define_insn "avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>"
[(set (match_operand:AVXMODEF4P 0 "register_operand" "=x")
(vec_concat:AVXMODEF4P
(vec_concat:<avxhalfvecmode>
(match_operand:<avxscalarmode> 1 "memory_operand" "m")
(match_dup 1))
(vec_concat:<avxhalfvecmode>
(match_dup 1)
(match_dup 1))))]
"TARGET_AVX"
"vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "<avxscalarmode>")])
(define_insn "avx_vbroadcastss256"
[(set (match_operand:V8SF 0 "register_operand" "=x")
(vec_concat:V8SF
(vec_concat:V4SF
(vec_concat:V2SF
(match_operand:SF 1 "memory_operand" "m")
(match_dup 1))
(vec_concat:V2SF
(match_dup 1)
(match_dup 1)))
(vec_concat:V4SF
(vec_concat:V2SF
(match_dup 1)
(match_dup 1))
(vec_concat:V2SF
(match_dup 1)
(match_dup 1)))))]
"TARGET_AVX"
"vbroadcastss\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "SF")])
(define_insn "avx_vbroadcastf128_p<avxmodesuffixf2c>256"
[(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x")
(vec_concat:AVX256MODEF2P
(match_operand:<avxhalfvecmode> 1 "memory_operand" "m")
(match_dup 1)))]
"TARGET_AVX"
"vbroadcastf128\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "V4SF")])
(define_expand "avx_vinsertf128<mode>"
[(match_operand:AVX256MODE 0 "register_operand" "")
(match_operand:AVX256MODE 1 "register_operand" "")