mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-04 04:40:27 +08:00
Implement vec_perm broadcast, and tidy lots of patterns to help.
From-SVN: r154836
This commit is contained in:
parent
9fda11a2ec
commit
5e04b3b694
@ -1,3 +1,50 @@
|
||||
2009-11-30 Richard Henderson <rth@redhat.com>
|
||||
|
||||
* config/i386/i386.c (ix86_vec_interleave_v2df_operator_ok): New.
|
||||
(bdesc_special_args): Update insn codes.
|
||||
(avx_vpermilp_parallel): Correct range check.
|
||||
(ix86_rtx_costs): Handle vector permutation rtx codes.
|
||||
(struct expand_vec_perm_d): Move earlier.
|
||||
(get_mode_wider_vector): New.
|
||||
(expand_vec_perm_broadcast_1): New.
|
||||
(ix86_expand_vector_init_duplicate): Use it. Tidy AVX modes.
|
||||
(expand_vec_perm_broadcast): New.
|
||||
(ix86_expand_vec_perm_builtin_1): Use it.
|
||||
* config/i386/i386-protos.h: Update.
|
||||
* config/i386/predicates.md (avx_vbroadcast_operand): New.
|
||||
* config/i386/sse.md (AVX256MODE24P): New.
|
||||
(ssescalarmodesuffix2s): New.
|
||||
(avxhalfvecmode, avxscalarmode): Fill out to all modes.
|
||||
(avxmodesuffixf2c): Add V8SI, V4DI.
|
||||
(vec_dupv4sf): New expander.
|
||||
(*vec_dupv4sf_avx): Add vbroadcastss alternative.
|
||||
(*vec_set<mode>_0_avx, *vec_set<mode>_0_sse4_1): Macro-ize for
|
||||
V4SF and V4SI. Move C alternatives to front. Add insertps and
|
||||
pinsrd alternatives.
|
||||
(*vec_set<mode>_0_sse2): Split out from ...
|
||||
(vec_set<mode>_0): Macro-ize for V4SF and V4SI.
|
||||
(vec_interleave_highv2df, vec_interleave_lowv2df): Require register
|
||||
destination; use ix86_vec_interleave_v2df_operator_ok, instead of
|
||||
ix86_fixup_binary_operands.
|
||||
(*avx_interleave_highv2df, avx_interleave_lowv2df): Add movddup.
|
||||
(*sse3_interleave_highv2df, sse3_interleave_lowv2df): New.
|
||||
(*avx_movddup, *sse3_movddup): Remove. New splitter from
|
||||
vec_select form to vec_duplicate form.
|
||||
(*sse2_interleave_highv2df, sse2_interleave_lowv2df): Use
|
||||
ix86_vec_interleave_v2df_operator_ok.
|
||||
(avx_movddup256, avx_unpcklpd256): Change to expanders, merge into ...
|
||||
(*avx_unpcklpd256): ... here.
|
||||
(*vec_dupv4si_avx): New.
|
||||
(*vec_dupv2di_avx): Add movddup alternative.
|
||||
(*vec_dupv2di_sse3): New.
|
||||
(vec_dup<AVX256MODE24P>): Replace avx_vbroadcasts<AVXMODEF4P> and
|
||||
avx_vbroadcastss256; represent with vec_duplicate instead of
|
||||
nested vec_concat operations.
|
||||
(avx_vbroadcastf128_<mode>): Rename from
|
||||
avx_vbroadcastf128_p<avxmodesuffixf2c>256.
|
||||
(*avx_vperm_broadcast_v4sf): New.
|
||||
(*avx_vperm_broadcast_<AVX256MODEF2P>): New.
|
||||
|
||||
2009-11-30 Martin Jambor <mjambor@suse.cz>
|
||||
|
||||
PR middle-end/42196
|
||||
|
@ -86,6 +86,7 @@ extern void ix86_expand_binary_operator (enum rtx_code,
|
||||
enum machine_mode, rtx[]);
|
||||
extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
|
||||
extern bool ix86_lea_for_add_ok (enum rtx_code, rtx, rtx[]);
|
||||
extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high);
|
||||
extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
|
||||
extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
|
||||
extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
|
||||
|
@ -13849,6 +13849,19 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
|
||||
are ok, keeping in mind the possible movddup alternative.

OPERANDS[0] is the destination and OPERANDS[1], OPERANDS[2] are the
two V2DF inputs of the interleave patterns in sse.md.  HIGH selects
which form is being checked: nonzero for vec_interleave_highv2df,
zero for vec_interleave_lowv2df.  */
|
||||
|
||||
bool
|
||||
ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
|
||||
{
|
||||
/* A memory destination is only accepted when it is rtx_equal_p to
input number 1 + HIGH, i.e. OPERANDS[1] for the low form and
OPERANDS[2] for the high form.  */
if (MEM_P (operands[0]))
|
||||
return rtx_equal_p (operands[0], operands[1 + high]);
|
||||
/* Two memory inputs are only accepted with SSE3, and only when both
inputs are equal rtxes (the movddup alternative).  */
if (MEM_P (operands[1]) && MEM_P (operands[2]))
|
||||
return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
|
||||
/* Every other register/memory combination is accepted.  */
return true;
|
||||
}
|
||||
|
||||
/* Post-reload splitter for converting an SF or DFmode value in an
|
||||
SSE register into an unsigned SImode. */
|
||||
|
||||
@ -21480,11 +21493,11 @@ static const struct builtin_description bdesc_special_args[] =
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
|
||||
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
|
||||
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
|
||||
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
|
||||
@ -24597,7 +24610,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
|
||||
if (!CONST_INT_P (er))
|
||||
return 0;
|
||||
ei = INTVAL (er);
|
||||
if (ei >= nelt)
|
||||
if (ei >= 2 * nelt)
|
||||
return 0;
|
||||
ipar[i] = ei;
|
||||
}
|
||||
@ -25713,6 +25726,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
|
||||
*total = 0;
|
||||
return false;
|
||||
|
||||
case VEC_SELECT:
|
||||
case VEC_CONCAT:
|
||||
case VEC_MERGE:
|
||||
case VEC_DUPLICATE:
|
||||
/* ??? Assume all of these vector manipulation patterns are
|
||||
recognizable. In which case they all pretty much have the
|
||||
same cost. */
|
||||
*total = COSTS_N_INSNS (1);
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@ -26547,16 +26570,43 @@ x86_emit_floatuns (rtx operands[2])
|
||||
emit_label (donelab);
|
||||
}
|
||||
|
||||
/* AVX does not support 32-byte integer vector operations,
|
||||
thus the longest vector we are faced with is V16QImode. */
|
||||
#define MAX_VECT_LEN 16
|
||||
|
||||
/* Description of a single vector permutation, handed to the
expand_vec_perm_* helpers.  */
struct expand_vec_perm_d
|
||||
{
|
||||
/* Destination of the permutation and its two input vectors.  The
broadcast matcher requires OP0 and OP1 to be the same rtx.  */
rtx target, op0, op1;
|
||||
/* Element selector: the broadcast code treats PERM[i] as the index of
the input element that ends up in result element i.  Only the first
NELT entries are examined there.
NOTE(review): presumably indices >= NELT select from OP1 -- confirm
against expand_vec_perm_1.  */
unsigned char perm[MAX_VECT_LEN];
|
||||
/* Vector mode the permutation is carried out in.  */
enum machine_mode vmode;
|
||||
/* Number of elements; ix86_expand_vector_init_duplicate fills this
with GET_MODE_NUNITS of the vector mode.  */
unsigned char nelt;
|
||||
/* NOTE(review): not read by any code visible here; by its name it
looks like a "only test whether expansion is possible" flag --
confirm against the users of this structure.  */
bool testing_p;
|
||||
};
|
||||
|
||||
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
|
||||
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
|
||||
|
||||
/* Get a vector mode of the same size as the original but with elements
|
||||
twice as wide. This is only guaranteed to apply to integral vectors.

O is the original vector mode.  The result is whatever
GET_MODE_WIDER_MODE yields for O; the assertions below fail if that
mode does not have exactly half as many units as O, or does not have
the same total size as O.  */
|
||||
|
||||
static inline enum machine_mode
|
||||
get_mode_wider_vector (enum machine_mode o)
|
||||
{
|
||||
/* ??? Rely on the ordering that genmodes.c gives to vectors. */
|
||||
enum machine_mode n = GET_MODE_WIDER_MODE (o);
|
||||
/* Sanity check: N must hold half as many elements as O ...  */
gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
|
||||
/* ... while occupying the same number of bytes.  */
gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
|
||||
return n;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
|
||||
with all elements equal to VAR. Return true if successful. */
|
||||
/* ??? Call into the vec_perm support to implement the broadcast. */
|
||||
|
||||
static bool
|
||||
ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
|
||||
rtx target, rtx val)
|
||||
{
|
||||
enum machine_mode hmode, smode, wsmode, wvmode;
|
||||
rtx x;
|
||||
bool ok;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
@ -26566,13 +26616,28 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
|
||||
return false;
|
||||
/* FALLTHRU */
|
||||
|
||||
case V4DFmode:
|
||||
case V4DImode:
|
||||
case V8SFmode:
|
||||
case V8SImode:
|
||||
case V2DFmode:
|
||||
case V2DImode:
|
||||
case V4SFmode:
|
||||
case V4SImode:
|
||||
val = force_reg (GET_MODE_INNER (mode), val);
|
||||
x = gen_rtx_VEC_DUPLICATE (mode, val);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, target, x));
|
||||
{
|
||||
rtx insn, dup;
|
||||
|
||||
/* First attempt to recognize VAL as-is. */
|
||||
dup = gen_rtx_VEC_DUPLICATE (mode, val);
|
||||
insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
|
||||
if (recog_memoized (insn) < 0)
|
||||
{
|
||||
/* If that fails, force VAL into a register. */
|
||||
XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
|
||||
ok = recog_memoized (insn) >= 0;
|
||||
gcc_assert (ok);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
case V4HImode:
|
||||
@ -26580,130 +26645,87 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
|
||||
return false;
|
||||
if (TARGET_SSE || TARGET_3DNOW_A)
|
||||
{
|
||||
rtx x;
|
||||
|
||||
val = gen_lowpart (SImode, val);
|
||||
x = gen_rtx_TRUNCATE (HImode, val);
|
||||
x = gen_rtx_VEC_DUPLICATE (mode, x);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, target, x));
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
smode = HImode;
|
||||
wsmode = SImode;
|
||||
wvmode = V2SImode;
|
||||
goto widen;
|
||||
}
|
||||
goto widen;
|
||||
|
||||
case V8QImode:
|
||||
if (!mmx_ok)
|
||||
return false;
|
||||
smode = QImode;
|
||||
wsmode = HImode;
|
||||
wvmode = V4HImode;
|
||||
goto widen;
|
||||
|
||||
case V8HImode:
|
||||
if (TARGET_SSE2)
|
||||
{
|
||||
struct expand_vec_perm_d dperm;
|
||||
rtx tmp1, tmp2;
|
||||
/* Extend HImode to SImode using a paradoxical SUBREG. */
|
||||
|
||||
permute:
|
||||
memset (&dperm, 0, sizeof (dperm));
|
||||
dperm.target = target;
|
||||
dperm.vmode = mode;
|
||||
dperm.nelt = GET_MODE_NUNITS (mode);
|
||||
dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
|
||||
|
||||
/* Extend to SImode using a paradoxical SUBREG. */
|
||||
tmp1 = gen_reg_rtx (SImode);
|
||||
emit_move_insn (tmp1, gen_lowpart (SImode, val));
|
||||
/* Insert the SImode value as low element of V4SImode vector. */
|
||||
tmp2 = gen_reg_rtx (V4SImode);
|
||||
tmp1 = gen_rtx_VEC_MERGE (V4SImode,
|
||||
gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
|
||||
CONST0_RTX (V4SImode),
|
||||
const1_rtx);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
|
||||
/* Cast the V4SImode vector back to a V8HImode vector. */
|
||||
tmp1 = gen_reg_rtx (V8HImode);
|
||||
emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
|
||||
/* Duplicate the low short through the whole low SImode word. */
|
||||
emit_insn (gen_vec_interleave_lowv8hi (tmp1, tmp1, tmp1));
|
||||
/* Cast the V8HImode vector back to a V4SImode vector. */
|
||||
tmp2 = gen_reg_rtx (V4SImode);
|
||||
emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
|
||||
/* Replicate the low element of the V4SImode vector. */
|
||||
emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
|
||||
/* Cast the V2SImode back to V8HImode, and store in target. */
|
||||
emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
|
||||
return true;
|
||||
|
||||
/* Insert the SImode value as low element of a V4SImode vector. */
|
||||
tmp2 = gen_lowpart (V4SImode, dperm.op0);
|
||||
emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
|
||||
|
||||
ok = (expand_vec_perm_1 (&dperm)
|
||||
|| expand_vec_perm_broadcast_1 (&dperm));
|
||||
gcc_assert (ok);
|
||||
return ok;
|
||||
}
|
||||
smode = HImode;
|
||||
wsmode = SImode;
|
||||
wvmode = V4SImode;
|
||||
goto widen;
|
||||
|
||||
case V16QImode:
|
||||
if (TARGET_SSE2)
|
||||
{
|
||||
rtx tmp1, tmp2;
|
||||
/* Extend QImode to SImode using a paradoxical SUBREG. */
|
||||
tmp1 = gen_reg_rtx (SImode);
|
||||
emit_move_insn (tmp1, gen_lowpart (SImode, val));
|
||||
/* Insert the SImode value as low element of V4SImode vector. */
|
||||
tmp2 = gen_reg_rtx (V4SImode);
|
||||
tmp1 = gen_rtx_VEC_MERGE (V4SImode,
|
||||
gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
|
||||
CONST0_RTX (V4SImode),
|
||||
const1_rtx);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
|
||||
/* Cast the V4SImode vector back to a V16QImode vector. */
|
||||
tmp1 = gen_reg_rtx (V16QImode);
|
||||
emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
|
||||
/* Duplicate the low byte through the whole low SImode word. */
|
||||
emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
|
||||
emit_insn (gen_vec_interleave_lowv16qi (tmp1, tmp1, tmp1));
|
||||
/* Cast the V16QImode vector back to a V4SImode vector. */
|
||||
tmp2 = gen_reg_rtx (V4SImode);
|
||||
emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
|
||||
/* Replicate the low element of the V4SImode vector. */
|
||||
emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
|
||||
/* Cast the V2SImode back to V16QImode, and store in target. */
|
||||
emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
|
||||
return true;
|
||||
}
|
||||
smode = QImode;
|
||||
wsmode = HImode;
|
||||
wvmode = V8HImode;
|
||||
goto permute;
|
||||
goto widen;
|
||||
|
||||
widen:
|
||||
/* Replicate the value once into the next wider mode and recurse. */
|
||||
val = convert_modes (wsmode, smode, val, true);
|
||||
x = expand_simple_binop (wsmode, ASHIFT, val,
|
||||
GEN_INT (GET_MODE_BITSIZE (smode)),
|
||||
NULL_RTX, 1, OPTAB_LIB_WIDEN);
|
||||
val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
|
||||
|
||||
x = gen_reg_rtx (wvmode);
|
||||
if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
|
||||
gcc_unreachable ();
|
||||
emit_move_insn (target, gen_lowpart (mode, x));
|
||||
return true;
|
||||
|
||||
case V4DFmode:
|
||||
hmode = V2DFmode;
|
||||
goto half;
|
||||
case V4DImode:
|
||||
hmode = V2DImode;
|
||||
goto half;
|
||||
case V8SFmode:
|
||||
hmode = V4SFmode;
|
||||
goto half;
|
||||
case V8SImode:
|
||||
hmode = V4SImode;
|
||||
goto half;
|
||||
case V16HImode:
|
||||
hmode = V8HImode;
|
||||
goto half;
|
||||
case V32QImode:
|
||||
hmode = V16QImode;
|
||||
goto half;
|
||||
half:
|
||||
{
|
||||
rtx tmp = gen_reg_rtx (hmode);
|
||||
ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, target,
|
||||
gen_rtx_VEC_CONCAT (mode, tmp, tmp)));
|
||||
enum machine_mode smode, wsmode, wvmode;
|
||||
rtx x;
|
||||
|
||||
smode = GET_MODE_INNER (mode);
|
||||
wvmode = get_mode_wider_vector (mode);
|
||||
wsmode = GET_MODE_INNER (wvmode);
|
||||
|
||||
val = convert_modes (wsmode, smode, val, true);
|
||||
x = expand_simple_binop (wsmode, ASHIFT, val,
|
||||
GEN_INT (GET_MODE_BITSIZE (smode)),
|
||||
NULL_RTX, 1, OPTAB_LIB_WIDEN);
|
||||
val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
|
||||
|
||||
x = gen_lowpart (wvmode, target);
|
||||
ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
|
||||
gcc_assert (ok);
|
||||
return ok;
|
||||
}
|
||||
|
||||
case V16HImode:
|
||||
case V32QImode:
|
||||
{
|
||||
enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
|
||||
rtx x = gen_reg_rtx (hvmode);
|
||||
|
||||
ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
|
||||
gcc_assert (ok);
|
||||
|
||||
x = gen_rtx_VEC_CONCAT (mode, x, x);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, target, x));
|
||||
}
|
||||
return true;
|
||||
|
||||
@ -29085,19 +29107,6 @@ ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
|
||||
return ix86_builtins[(int) fcode];
|
||||
}
|
||||
|
||||
/* AVX does not support 32-byte integer vector operations,
|
||||
thus the longest vector we are faced with is V16QImode. */
|
||||
#define MAX_VECT_LEN 16
|
||||
|
||||
struct expand_vec_perm_d
|
||||
{
|
||||
rtx target, op0, op1;
|
||||
unsigned char perm[MAX_VECT_LEN];
|
||||
enum machine_mode vmode;
|
||||
unsigned char nelt;
|
||||
bool testing_p;
|
||||
};
|
||||
|
||||
/* Return a vector mode with twice as many elements as VMODE. */
|
||||
/* ??? Consider moving this to a table generated by genmodes.c. */
|
||||
|
||||
@ -29739,8 +29748,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
|
||||
extract-even and extract-odd permutations. */
|
||||
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
|
||||
and extract-odd permutations. */
|
||||
|
||||
static bool
|
||||
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
|
||||
@ -29855,6 +29864,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
|
||||
extract-even and extract-odd permutations. */
|
||||
|
||||
static bool
|
||||
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
|
||||
{
|
||||
@ -29871,6 +29883,84 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
|
||||
return expand_vec_perm_even_odd_1 (d, odd);
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
|
||||
permutations. We assume that expand_vec_perm_1 has already failed.

D describes the permutation; D->perm[0] is taken as the index of the
element of D->op0 to replicate into every element of D->target.
Only V8HImode and V16QImode are actually expanded here and, for
those, the function always returns true; every other mode reaching
this function is treated as a cannot-happen case.  */
|
||||
|
||||
static bool
|
||||
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
|
||||
{
|
||||
/* ELT is the element to broadcast, renumbered as the vector is
narrowed to one half at each step; NELT2 is half the element
count of the current mode.  */
unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
|
||||
enum machine_mode vmode = d->vmode;
|
||||
unsigned char perm2[4];
|
||||
rtx op0 = d->op0;
|
||||
bool ok;
|
||||
|
||||
switch (vmode)
|
||||
{
|
||||
case V4DFmode:
|
||||
case V8SFmode:
|
||||
/* These are special-cased in sse.md so that we can optionally
|
||||
use the vbroadcast instruction. They expand to two insns
|
||||
if the input happens to be in a register. */
|
||||
gcc_unreachable ();
|
||||
|
||||
case V2DFmode:
|
||||
case V2DImode:
|
||||
case V4SFmode:
|
||||
case V4SImode:
|
||||
/* These are always implementable using standard shuffle patterns. */
|
||||
gcc_unreachable ();
|
||||
|
||||
case V8HImode:
|
||||
case V16QImode:
|
||||
/* These can be implemented via interleave. We save one insn by
|
||||
stopping once we have promoted to V4SImode and then use pshufd. */
|
||||
do
|
||||
{
|
||||
optab otab = vec_interleave_low_optab;
|
||||
|
||||
/* Pick the half of the vector that holds ELT and renumber ELT
relative to that half.  */
if (elt >= nelt2)
|
||||
{
|
||||
otab = vec_interleave_high_optab;
|
||||
elt -= nelt2;
|
||||
}
|
||||
nelt2 /= 2;
|
||||
|
||||
/* Interleaving OP0 with itself doubles every element of the chosen
half, so after reinterpreting the result in the next wider
element mode, element ELT holds two copies of the wanted value.  */
op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
|
||||
vmode = get_mode_wider_vector (vmode);
|
||||
op0 = gen_lowpart (vmode, op0);
|
||||
}
|
||||
while (vmode != V4SImode);
|
||||
|
||||
/* Finish with a 4-element select that picks element ELT for all four
positions, writing D->target viewed as V4SImode.  */
memset (perm2, elt, 4);
|
||||
ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
|
||||
gcc_assert (ok);
|
||||
return true;
|
||||
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
|
||||
broadcast permutations.

Return false, without expanding anything, unless D->op0 and D->op1
are the same rtx and all of the first D->nelt selector entries are
equal; otherwise hand D to expand_vec_perm_broadcast_1 and return
its result.  */
|
||||
|
||||
static bool
|
||||
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
|
||||
{
|
||||
unsigned i, elt, nelt = d->nelt;
|
||||
|
||||
/* Only single-input permutations are considered (pointer identity
of the two operand rtxes, not rtx_equal_p).  */
if (d->op0 != d->op1)
|
||||
return false;
|
||||
|
||||
/* A broadcast selects the same input element for every result
element.  */
elt = d->perm[0];
|
||||
for (i = 1; i < nelt; ++i)
|
||||
if (d->perm[i] != elt)
|
||||
return false;
|
||||
|
||||
return expand_vec_perm_broadcast_1 (d);
|
||||
}
|
||||
|
||||
/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
|
||||
With all of the interface bits taken care of, perform the expansion
|
||||
in D and return true on success. */
|
||||
@ -29878,8 +29968,7 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
|
||||
static bool
|
||||
ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
|
||||
{
|
||||
/* First things first -- check if the instruction is implementable
|
||||
with a single instruction. */
|
||||
/* Try a single instruction expansion. */
|
||||
if (expand_vec_perm_1 (d))
|
||||
return true;
|
||||
|
||||
@ -29894,13 +29983,16 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
|
||||
if (expand_vec_perm_interleave2 (d))
|
||||
return true;
|
||||
|
||||
if (expand_vec_perm_broadcast (d))
|
||||
return true;
|
||||
|
||||
/* Try sequences of three instructions. */
|
||||
|
||||
if (expand_vec_perm_pshufb2 (d))
|
||||
return true;
|
||||
|
||||
/* ??? Look for narrow permutations whose element orderings would
|
||||
allow the promition to a wider mode. */
|
||||
allow the promotion to a wider mode. */
|
||||
|
||||
/* ??? Look for sequences of interleave or a wider permute that place
|
||||
the data into the correct lanes for a half-vector shuffle like
|
||||
@ -29912,8 +30004,6 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
|
||||
if (expand_vec_perm_even_odd (d))
|
||||
return true;
|
||||
|
||||
/* ??? Pattern match broadcast. */
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1241,3 +1241,20 @@
|
||||
(define_predicate "avx_vperm2f128_v4df_operand"
|
||||
(and (match_code "parallel")
|
||||
(match_test "avx_vperm2f128_parallel (op, V4DFmode)")))
|
||||
|
||||
;; Return 1 if OP is a parallel for a vbroadcast permute.
|
||||
|
||||
(define_predicate "avx_vbroadcast_operand"
|
||||
(and (match_code "parallel")
|
||||
(match_code "const_int" "a"))
|
||||
{
|
||||
rtx elt = XVECEXP (op, 0, 0);
|
||||
int i, nelt = XVECLEN (op, 0);
|
||||
|
||||
/* Don't bother checking there are the right number of operands,
|
||||
merely that they're all identical.

The match_code tests above have already established that OP is a
parallel; per the GCC predicate documentation the "a" path makes the
const_int test apply to element 0 of the vector.  The remaining
elements are compared against element 0 by pointer.
NOTE(review): this relies on equal const_int values being
represented by the same rtx object -- confirm against the RTL
sharing rules.  */
|
||||
for (i = 1; i < nelt; ++i)
|
||||
if (XVECEXP (op, 0, i) != elt)
|
||||
return false;
|
||||
return true;
|
||||
})
|
||||
|
@ -54,6 +54,7 @@
|
||||
|
||||
(define_mode_iterator AVX256MODEF2P [V8SF V4DF])
|
||||
(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
|
||||
(define_mode_iterator AVX256MODE24P [V8SI V8SF V4DI V4DF])
|
||||
(define_mode_iterator AVX256MODE4P [V4DI V4DF])
|
||||
(define_mode_iterator AVX256MODE8P [V8SI V8SF])
|
||||
(define_mode_iterator AVXMODEF2P [V4SF V2DF V8SF V4DF])
|
||||
@ -96,6 +97,8 @@
|
||||
|
||||
(define_mode_attr ssemodesuffixf2c [(V4SF "s") (V2DF "d")])
|
||||
|
||||
(define_mode_attr ssescalarmodesuffix2s [(V4SF "ss") (V4SI "d")])
|
||||
|
||||
;; Mapping of the max integer size for xop rotate immediate constraint
|
||||
(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
|
||||
|
||||
@ -125,17 +128,18 @@
|
||||
[(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF")
|
||||
(V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")])
|
||||
(define_mode_attr avxhalfvecmode
|
||||
[(V4SF "V2SF") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI")
|
||||
(V4DI "V2DI") (V8SF "V4SF") (V4DF "V2DF")])
|
||||
[(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
|
||||
(V8SF "V4SF") (V4DF "V2DF")
|
||||
(V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V4SF "V2SF")])
|
||||
(define_mode_attr avxscalarmode
|
||||
[(V16QI "QI") (V8HI "HI") (V4SI "SI") (V4SF "SF") (V2DF "DF")
|
||||
(V8SF "SF") (V4DF "DF")])
|
||||
[(V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI") (V4SF "SF") (V2DF "DF")
|
||||
(V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") (V8SF "SF") (V4DF "DF")])
|
||||
(define_mode_attr avxcvtvecmode
|
||||
[(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")])
|
||||
(define_mode_attr avxpermvecmode
|
||||
[(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")])
|
||||
(define_mode_attr avxmodesuffixf2c
|
||||
[(V4SF "s") (V2DF "d") (V8SF "s") (V4DF "d")])
|
||||
[(V4SF "s") (V2DF "d") (V8SI "s") (V8SF "s") (V4DI "d") (V4DF "d")])
|
||||
(define_mode_attr avxmodesuffixp
|
||||
[(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si")
|
||||
(V4DF "pd")])
|
||||
@ -4012,14 +4016,27 @@
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "mode" "SF")])
|
||||
|
||||
(define_insn "*vec_dupv4sf_avx"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=x")
|
||||
(define_expand "vec_dupv4sf"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "")
|
||||
(vec_duplicate:V4SF
|
||||
(match_operand:SF 1 "register_operand" "x")))]
|
||||
(match_operand:SF 1 "nonimmediate_operand" "")))]
|
||||
"TARGET_SSE"
|
||||
{
|
||||
/* Broadcast the SFmode scalar in operand 1 into all four elements of
operand 0.  The AVX insn accepts a memory source (its vbroadcastss
alternative); without AVX the scalar is forced into a register
first.  Operand 1 is an SFmode value, so SFmode -- not the V4SFmode
of the destination -- is the mode to hand to force_reg; passing the
vector mode would request a mode-mismatched move for a memory
operand.  */
if (!TARGET_AVX)
|
||||
operands[1] = force_reg (SFmode, operands[1]);
|
||||
})
|
||||
|
||||
(define_insn "*vec_dupv4sf_avx"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=x,x")
|
||||
(vec_duplicate:V4SF
|
||||
(match_operand:SF 1 "nonimmediate_operand" "x,m")))]
|
||||
"TARGET_AVX"
|
||||
"vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}"
|
||||
[(set_attr "type" "sselog1")
|
||||
(set_attr "length_immediate" "1")
|
||||
"@
|
||||
vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}
|
||||
vbroadcastss\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog1,ssemov")
|
||||
(set_attr "length_immediate" "1,0")
|
||||
(set_attr "prefix_extra" "0,1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V4SF")])
|
||||
|
||||
@ -4125,35 +4142,78 @@
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "*vec_setv4sf_0_avx"
|
||||
[(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,m")
|
||||
(vec_merge:V4SF
|
||||
(vec_duplicate:V4SF
|
||||
(match_operand:SF 2 "general_operand" " x,m,*r,x*rfF"))
|
||||
(match_operand:V4SF 1 "vector_move_operand" " x,C,C ,0")
|
||||
(define_insn "*vec_set<mode>_0_avx"
|
||||
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
|
||||
(vec_merge:SSEMODE4S
|
||||
(vec_duplicate:SSEMODE4S
|
||||
(match_operand:<ssescalarmode> 2
|
||||
"general_operand" " x,m,*r,x,*rm,x*rfF"))
|
||||
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,x, x,0")
|
||||
(const_int 1)))]
|
||||
"TARGET_AVX"
|
||||
"@
|
||||
vmovss\t{%2, %1, %0|%0, %1, %2}
|
||||
vmovss\t{%2, %0|%0, %2}
|
||||
vinsertps\t{$0xe, %2, %2, %0|%0, %2, %2, 0xe}
|
||||
vmov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
|
||||
vmovd\t{%2, %0|%0, %2}
|
||||
vmovss\t{%2, %1, %0|%0, %1, %2}
|
||||
vpinsrd\t{$0, %2, %1, %0|%0, %1, %2, 0}
|
||||
#"
|
||||
[(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*")
|
||||
(set_attr "prefix_extra" "*,*,*,*,1,*")
|
||||
(set_attr "length_immediate" "*,*,*,*,1,*")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "SF,<ssescalarmode>,SI,SF,TI,*")])
|
||||
|
||||
(define_insn "*vec_set<mode>_0_sse4_1"
|
||||
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
|
||||
(vec_merge:SSEMODE4S
|
||||
(vec_duplicate:SSEMODE4S
|
||||
(match_operand:<ssescalarmode> 2
|
||||
"general_operand" " x,m,*r,x,*rm,*rfF"))
|
||||
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,0, 0,0")
|
||||
(const_int 1)))]
|
||||
"TARGET_SSE4_1"
|
||||
"@
|
||||
insertps\t{$0xe, %2, %0|%0, %2, 0xe}
|
||||
mov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
|
||||
movd\t{%2, %0|%0, %2}
|
||||
movss\t{%2, %0|%0, %2}
|
||||
pinsrd\t{$0, %2, %0|%0, %2, 0}
|
||||
#"
|
||||
[(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*")
|
||||
(set_attr "prefix_extra" "*,*,*,*,1,*")
|
||||
(set_attr "length_immediate" "*,*,*,*,1,*")
|
||||
(set_attr "mode" "SF,<ssescalarmode>,SI,SF,TI,*")])
|
||||
|
||||
(define_insn "*vec_set<mode>_0_sse2"
|
||||
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x, x,x,m")
|
||||
(vec_merge:SSEMODE4S
|
||||
(vec_duplicate:SSEMODE4S
|
||||
(match_operand:<ssescalarmode> 2
|
||||
"general_operand" " m,*r,x,x*rfF"))
|
||||
(match_operand:SSEMODE4S 1 "vector_move_operand" " C, C,0,0")
|
||||
(const_int 1)))]
|
||||
"TARGET_SSE2"
|
||||
"@
|
||||
mov<ssescalarmodesuffix2s>\t{%2, %0|%0, %2}
|
||||
movd\t{%2, %0|%0, %2}
|
||||
movss\t{%2, %0|%0, %2}
|
||||
#"
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "SF")])
|
||||
(set_attr "mode" "<ssescalarmode>,SI,SF,*")])
|
||||
|
||||
(define_insn "vec_setv4sf_0"
|
||||
[(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m")
|
||||
(vec_merge:V4SF
|
||||
(vec_duplicate:V4SF
|
||||
(match_operand:SF 2 "general_operand" " x,m,*r,x*rfF"))
|
||||
(match_operand:V4SF 1 "vector_move_operand" " 0,C,C ,0")
|
||||
(define_insn "vec_set<mode>_0"
|
||||
[(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x,m")
|
||||
(vec_merge:SSEMODE4S
|
||||
(vec_duplicate:SSEMODE4S
|
||||
(match_operand:<ssescalarmode> 2
|
||||
"general_operand" " m,x,x*rfF"))
|
||||
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,0,0")
|
||||
(const_int 1)))]
|
||||
"TARGET_SSE"
|
||||
"@
|
||||
movss\t{%2, %0|%0, %2}
|
||||
movss\t{%2, %0|%0, %2}
|
||||
movd\t{%2, %0|%0, %2}
|
||||
#"
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "mode" "SF")])
|
||||
@ -4484,7 +4544,7 @@
|
||||
(set_attr "mode" "V4DF")])
|
||||
|
||||
(define_expand "vec_interleave_highv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "")
|
||||
[(set (match_operand:V2DF 0 "register_operand" "")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" "")
|
||||
@ -4492,24 +4552,46 @@
|
||||
(parallel [(const_int 1)
|
||||
(const_int 3)])))]
|
||||
"TARGET_SSE2"
|
||||
"ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
|
||||
{
|
||||
if (!ix86_vec_interleave_v2df_operator_ok (operands, 1))
|
||||
operands[2] = force_reg (V2DFmode, operands[2]);
|
||||
})
|
||||
|
||||
(define_insn "*avx_interleave_highv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " x,o,x")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,x,0"))
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " x,o,o,x")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,x,0"))
|
||||
(parallel [(const_int 1)
|
||||
(const_int 3)])))]
|
||||
"TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
|
||||
"TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
|
||||
"@
|
||||
vunpckhpd\t{%2, %1, %0|%0, %1, %2}
|
||||
vmovddup\t{%H1, %0|%0, %H1}
|
||||
vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
|
||||
vmovhpd\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog,ssemov,ssemov")
|
||||
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V2DF,V1DF,V1DF")])
|
||||
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_insn "*sse3_interleave_highv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " 0,o,o,x")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,0,0"))
|
||||
(parallel [(const_int 1)
|
||||
(const_int 3)])))]
|
||||
"TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
|
||||
"@
|
||||
unpckhpd\t{%2, %0|%0, %2}
|
||||
movddup\t{%H1, %0|%0, %H1}
|
||||
movlpd\t{%H1, %0|%0, %H1}
|
||||
movhpd\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
|
||||
(set_attr "prefix_data16" "*,*,1,1")
|
||||
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_insn "*sse2_interleave_highv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
|
||||
@ -4519,7 +4601,7 @@
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,0,0"))
|
||||
(parallel [(const_int 1)
|
||||
(const_int 3)])))]
|
||||
"TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
|
||||
"TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
|
||||
"@
|
||||
unpckhpd\t{%2, %0|%0, %2}
|
||||
movlpd\t{%H1, %0|%0, %H1}
|
||||
@ -4528,50 +4610,112 @@
|
||||
(set_attr "prefix_data16" "*,1,1")
|
||||
(set_attr "mode" "V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_insn "avx_movddup256"
|
||||
[(set (match_operand:V4DF 0 "register_operand" "=x")
|
||||
;; Recall that the 256-bit unpck insns only shuffle within their lanes.
|
||||
(define_expand "avx_movddup256"
|
||||
[(set (match_operand:V4DF 0 "register_operand" "")
|
||||
(vec_select:V4DF
|
||||
(vec_concat:V8DF
|
||||
(match_operand:V4DF 1 "nonimmediate_operand" "xm")
|
||||
(match_operand:V4DF 1 "nonimmediate_operand" "")
|
||||
(match_dup 1))
|
||||
(parallel [(const_int 0) (const_int 2)
|
||||
(const_int 4) (const_int 6)])))]
|
||||
(parallel [(const_int 0) (const_int 4)
|
||||
(const_int 2) (const_int 6)])))]
|
||||
"TARGET_AVX"
|
||||
"vmovddup\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog1")
|
||||
"")
|
||||
|
||||
(define_expand "avx_unpcklpd256"
|
||||
[(set (match_operand:V4DF 0 "register_operand" "")
|
||||
(vec_select:V4DF
|
||||
(vec_concat:V8DF
|
||||
(match_operand:V4DF 1 "register_operand" "")
|
||||
(match_operand:V4DF 2 "nonimmediate_operand" ""))
|
||||
(parallel [(const_int 0) (const_int 4)
|
||||
(const_int 2) (const_int 6)])))]
|
||||
"TARGET_AVX"
|
||||
"")
|
||||
|
||||
(define_insn "*avx_unpcklpd256"
|
||||
[(set (match_operand:V4DF 0 "register_operand" "=x,x")
|
||||
(vec_select:V4DF
|
||||
(vec_concat:V8DF
|
||||
(match_operand:V4DF 1 "nonimmediate_operand" "xm,x")
|
||||
(match_operand:V4DF 2 "nonimmediate_operand" " 1,xm"))
|
||||
(parallel [(const_int 0) (const_int 4)
|
||||
(const_int 2) (const_int 6)])))]
|
||||
"TARGET_AVX
|
||||
&& (!MEM_P (operands[1]) || rtx_equal_p (operands[1], operands[2]))"
|
||||
"@
|
||||
vmovddup\t{%1, %0|%0, %1}
|
||||
vunpcklpd\t{%2, %1, %0|%0, %1, %2}"
|
||||
[(set_attr "type" "sselog")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V4DF")])
|
||||
|
||||
(define_insn "*avx_movddup"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
|
||||
(define_expand "vec_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "register_operand" "")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
|
||||
(match_dup 1))
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" "")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" ""))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
|
||||
"@
|
||||
vmovddup\t{%1, %0|%0, %1}
|
||||
#"
|
||||
[(set_attr "type" "sselog1,ssemov")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V2DF")])
|
||||
"TARGET_SSE2"
|
||||
{
|
||||
if (!ix86_vec_interleave_v2df_operator_ok (operands, 0))
|
||||
operands[1] = force_reg (V2DFmode, operands[1]);
|
||||
})
|
||||
|
||||
(define_insn "*sse3_movddup"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
|
||||
(define_insn "*avx_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
|
||||
(match_dup 1))
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " x,m,x,0")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_SSE3 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
|
||||
"TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
|
||||
"@
|
||||
vunpcklpd\t{%2, %1, %0|%0, %1, %2}
|
||||
vmovddup\t{%1, %0|%0, %1}
|
||||
vmovhpd\t{%2, %1, %0|%0, %1, %2}
|
||||
vmovlpd\t{%2, %H0|%H0, %2}"
|
||||
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_insn "*sse3_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " 0,m,0,0")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
|
||||
"@
|
||||
unpcklpd\t{%2, %0|%0, %2}
|
||||
movddup\t{%1, %0|%0, %1}
|
||||
#"
|
||||
[(set_attr "type" "sselog1,ssemov")
|
||||
(set_attr "mode" "V2DF")])
|
||||
movhpd\t{%2, %0|%0, %2}
|
||||
movlpd\t{%2, %H0|%H0, %2}"
|
||||
[(set_attr "type" "sselog,sselog,ssemov,ssemov")
|
||||
(set_attr "prefix_data16" "*,*,1,1")
|
||||
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_insn "*sse2_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
|
||||
"@
|
||||
unpcklpd\t{%2, %0|%0, %2}
|
||||
movhpd\t{%2, %0|%0, %2}
|
||||
movlpd\t{%2, %H0|%H0, %2}"
|
||||
[(set_attr "type" "sselog,ssemov,ssemov")
|
||||
(set_attr "prefix_data16" "*,1,1")
|
||||
(set_attr "mode" "V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_split
|
||||
[(set (match_operand:V2DF 0 "memory_operand" "")
|
||||
@ -4590,65 +4734,19 @@
|
||||
DONE;
|
||||
})
|
||||
|
||||
;; Recall that the 256-bit unpck insns only shuffle within their lanes.
|
||||
(define_insn "avx_unpcklpd256"
|
||||
[(set (match_operand:V4DF 0 "register_operand" "=x")
|
||||
(vec_select:V4DF
|
||||
(vec_concat:V8DF
|
||||
(match_operand:V4DF 1 "register_operand" "x")
|
||||
(match_operand:V4DF 2 "nonimmediate_operand" "xm"))
|
||||
(parallel [(const_int 0) (const_int 4)
|
||||
(const_int 2) (const_int 6)])))]
|
||||
"TARGET_AVX"
|
||||
"vunpcklpd\t{%2, %1, %0|%0, %1, %2}"
|
||||
[(set_attr "type" "sselog")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V4DF")])
|
||||
|
||||
(define_expand "vec_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "")
|
||||
(define_split
|
||||
[(set (match_operand:V2DF 0 "register_operand" "")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" "")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" ""))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_SSE2"
|
||||
"ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
|
||||
|
||||
(define_insn "*avx_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " x,x,0")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
|
||||
"@
|
||||
vunpcklpd\t{%2, %1, %0|%0, %1, %2}
|
||||
vmovhpd\t{%2, %1, %0|%0, %1, %2}
|
||||
vmovlpd\t{%2, %H0|%H0, %2}"
|
||||
[(set_attr "type" "sselog,ssemov,ssemov")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V2DF,V1DF,V1DF")])
|
||||
|
||||
(define_insn "*sse2_interleave_lowv2df"
|
||||
[(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
|
||||
(vec_select:V2DF
|
||||
(vec_concat:V4DF
|
||||
(match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0")
|
||||
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
|
||||
(parallel [(const_int 0)
|
||||
(const_int 2)])))]
|
||||
"TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
|
||||
"@
|
||||
unpcklpd\t{%2, %0|%0, %2}
|
||||
movhpd\t{%2, %0|%0, %2}
|
||||
movlpd\t{%2, %H0|%H0, %2}"
|
||||
[(set_attr "type" "sselog,ssemov,ssemov")
|
||||
(set_attr "prefix_data16" "*,1,1")
|
||||
(set_attr "mode" "V2DF,V1DF,V1DF")])
|
||||
(match_operand:V2DF 1 "memory_operand" "")
|
||||
(match_dup 1))
|
||||
(parallel [(match_operand:SI 2 "const_0_to_1_operand" "")
|
||||
(match_operand:SI 3 "const_int_operand" "")])))]
|
||||
"TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])"
|
||||
[(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))]
|
||||
{
|
||||
operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
|
||||
})
|
||||
|
||||
(define_expand "avx_shufpd256"
|
||||
[(match_operand:V4DF 0 "register_operand" "")
|
||||
@ -7408,6 +7506,20 @@
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "mode" "V2SF,V4SF,V2SF")])
|
||||
|
||||
(define_insn "*vec_dupv4si_avx"
|
||||
[(set (match_operand:V4SI 0 "register_operand" "=x,x")
|
||||
(vec_duplicate:V4SI
|
||||
(match_operand:SI 1 "register_operand" "x,m")))]
|
||||
"TARGET_AVX"
|
||||
"@
|
||||
vpshufd\t{$0, %1, %0|%0, %1, 0}
|
||||
vbroadcastss\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog1,ssemov")
|
||||
(set_attr "length_immediate" "1,0")
|
||||
(set_attr "prefix_extra" "0,1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "TI,V4SF")])
|
||||
|
||||
(define_insn "*vec_dupv4si"
|
||||
[(set (match_operand:V4SI 0 "register_operand" "=Y2,x")
|
||||
(vec_duplicate:V4SI
|
||||
@ -7417,19 +7529,31 @@
|
||||
%vpshufd\t{$0, %1, %0|%0, %1, 0}
|
||||
shufps\t{$0, %0, %0|%0, %0, 0}"
|
||||
[(set_attr "type" "sselog1")
|
||||
(set_attr "prefix" "maybe_vex,orig")
|
||||
(set_attr "length_immediate" "1")
|
||||
(set_attr "mode" "TI,V4SF")])
|
||||
|
||||
(define_insn "*vec_dupv2di_avx"
|
||||
[(set (match_operand:V2DI 0 "register_operand" "=x")
|
||||
[(set (match_operand:V2DI 0 "register_operand" "=x,x")
|
||||
(vec_duplicate:V2DI
|
||||
(match_operand:DI 1 "register_operand" "x")))]
|
||||
(match_operand:DI 1 "nonimmediate_operand" " x,m")))]
|
||||
"TARGET_AVX"
|
||||
"vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}"
|
||||
"@
|
||||
vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}
|
||||
vmovddup\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "TI")])
|
||||
(set_attr "mode" "TI,DF")])
|
||||
|
||||
(define_insn "*vec_dupv2di_sse3"
|
||||
[(set (match_operand:V2DI 0 "register_operand" "=x,x")
|
||||
(vec_duplicate:V2DI
|
||||
(match_operand:DI 1 "nonimmediate_operand" " 0,m")))]
|
||||
"TARGET_SSE3"
|
||||
"@
|
||||
punpcklqdq\t%0, %0
|
||||
movddup\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sselog1")
|
||||
(set_attr "mode" "TI,DF")])
|
||||
|
||||
(define_insn "*vec_dupv2di"
|
||||
[(set (match_operand:V2DI 0 "register_operand" "=Y2,x")
|
||||
@ -11838,6 +11962,108 @@
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "OI")])
|
||||
|
||||
(define_insn_and_split "vec_dup<mode>"
|
||||
[(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x")
|
||||
(vec_duplicate:AVX256MODE24P
|
||||
(match_operand:<avxscalarmode> 1 "nonimmediate_operand" "m,?x")))]
|
||||
"TARGET_AVX"
|
||||
"@
|
||||
vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}
|
||||
#"
|
||||
"&& reload_completed && REG_P (operands[1])"
|
||||
[(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1)))
|
||||
(set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))]
|
||||
{
|
||||
operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));
|
||||
}
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V8SF")])
|
||||
|
||||
(define_insn "avx_vbroadcastf128_<mode>"
|
||||
[(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x")
|
||||
(vec_concat:AVX256MODE
|
||||
(match_operand:<avxhalfvecmode> 1 "nonimmediate_operand" "m,0,?x")
|
||||
(match_dup 1)))]
|
||||
"TARGET_AVX"
|
||||
"@
|
||||
vbroadcastf128\t{%1, %0|%0, %1}
|
||||
vinsertf128\t{$1, %1, %0, %0|%0, %0, %1, 1}
|
||||
vperm2f128\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}"
|
||||
[(set_attr "type" "ssemov,sselog1,sselog1")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "length_immediate" "0,1,1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V4SF,V8SF,V8SF")])
|
||||
|
||||
;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
|
||||
;; If it so happens that the input is in memory, use vbroadcast.
|
||||
;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
|
||||
(define_insn "*avx_vperm_broadcast_v4sf"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
|
||||
(vec_select:V4SF
|
||||
(match_operand:V4SF 1 "nonimmediate_operand" "m,o,x")
|
||||
(match_parallel 2 "avx_vbroadcast_operand"
|
||||
[(match_operand 3 "const_int_operand" "C,n,n")])))]
|
||||
"TARGET_AVX"
|
||||
{
|
||||
int elt = INTVAL (operands[3]);
|
||||
switch (which_alternative)
|
||||
{
|
||||
case 0:
|
||||
case 1:
|
||||
operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
|
||||
return "vbroadcastss\t{%1, %0|%0, %1}";
|
||||
case 2:
|
||||
operands[2] = GEN_INT (elt * 0x55);
|
||||
return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
}
|
||||
[(set_attr "type" "ssemov,ssemov,sselog1")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "length_immediate" "0,0,1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "SF,SF,V4SF")])
|
||||
|
||||
(define_insn_and_split "*avx_vperm_broadcast_<mode>"
|
||||
[(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x,x,x")
|
||||
(vec_select:AVX256MODEF2P
|
||||
(match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "m,o,?x")
|
||||
(match_parallel 2 "avx_vbroadcast_operand"
|
||||
[(match_operand 3 "const_int_operand" "C,n,n")])))]
|
||||
"TARGET_AVX"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (vec_duplicate:AVX256MODEF2P (match_dup 1)))]
|
||||
{
|
||||
rtx op0 = operands[0], op1 = operands[1];
|
||||
int elt = INTVAL (operands[3]);
|
||||
|
||||
if (REG_P (op1))
|
||||
{
|
||||
int mask;
|
||||
|
||||
/* Shuffle element we care about into all elements of the 128-bit lane.
|
||||
The other lane gets shuffled too, but we don't care. */
|
||||
if (<MODE>mode == V4DFmode)
|
||||
mask = (elt & 1 ? 15 : 0);
|
||||
else
|
||||
mask = (elt & 3) * 0x55;
|
||||
emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
|
||||
|
||||
/* Shuffle the lane we care about into both lanes of the dest. */
|
||||
mask = (elt / (<ssescalarnum> / 2)) * 0x11;
|
||||
emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
|
||||
DONE;
|
||||
}
|
||||
|
||||
operands[1] = adjust_address_nv (op1, <avxscalarmode>mode,
|
||||
elt * GET_MODE_SIZE (<avxscalarmode>mode));
|
||||
})
|
||||
|
||||
(define_expand "avx_vpermil<mode>"
|
||||
[(set (match_operand:AVXMODEFDP 0 "register_operand" "")
|
||||
(vec_select:AVXMODEFDP
|
||||
@ -11989,58 +12215,6 @@
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V8SF")])
|
||||
|
||||
(define_insn "avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>"
|
||||
[(set (match_operand:AVXMODEF4P 0 "register_operand" "=x")
|
||||
(vec_concat:AVXMODEF4P
|
||||
(vec_concat:<avxhalfvecmode>
|
||||
(match_operand:<avxscalarmode> 1 "memory_operand" "m")
|
||||
(match_dup 1))
|
||||
(vec_concat:<avxhalfvecmode>
|
||||
(match_dup 1)
|
||||
(match_dup 1))))]
|
||||
"TARGET_AVX"
|
||||
"vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "<avxscalarmode>")])
|
||||
|
||||
(define_insn "avx_vbroadcastss256"
|
||||
[(set (match_operand:V8SF 0 "register_operand" "=x")
|
||||
(vec_concat:V8SF
|
||||
(vec_concat:V4SF
|
||||
(vec_concat:V2SF
|
||||
(match_operand:SF 1 "memory_operand" "m")
|
||||
(match_dup 1))
|
||||
(vec_concat:V2SF
|
||||
(match_dup 1)
|
||||
(match_dup 1)))
|
||||
(vec_concat:V4SF
|
||||
(vec_concat:V2SF
|
||||
(match_dup 1)
|
||||
(match_dup 1))
|
||||
(vec_concat:V2SF
|
||||
(match_dup 1)
|
||||
(match_dup 1)))))]
|
||||
"TARGET_AVX"
|
||||
"vbroadcastss\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "SF")])
|
||||
|
||||
(define_insn "avx_vbroadcastf128_p<avxmodesuffixf2c>256"
|
||||
[(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x")
|
||||
(vec_concat:AVX256MODEF2P
|
||||
(match_operand:<avxhalfvecmode> 1 "memory_operand" "m")
|
||||
(match_dup 1)))]
|
||||
"TARGET_AVX"
|
||||
"vbroadcastf128\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssemov")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "prefix" "vex")
|
||||
(set_attr "mode" "V4SF")])
|
||||
|
||||
(define_expand "avx_vinsertf128<mode>"
|
||||
[(match_operand:AVX256MODE 0 "register_operand" "")
|
||||
(match_operand:AVX256MODE 1 "register_operand" "")
|
||||
|
Loading…
x
Reference in New Issue
Block a user