
arm: Implement vec_perm and vec_perm_const for NEON.

* config/arm/arm.c (arm_vectorize_vec_perm_const_ok,
	TARGET_VECTORIZE_VEC_PERM_CONST_OK, neon_split_vcombine, MAX_VECT_LEN,
	struct expand_vec_perm_d, arm_expand_vec_perm_1, arm_expand_vec_perm,
	arm_evpc_neon_vuzp, arm_evpc_neon_vzip, arm_evpc_neon_vrev,
	arm_evpc_neon_vtrn, arm_evpc_neon_vtbl, arm_expand_vec_perm_const_1,
	arm_expand_vec_perm_const): New.
	* config/arm/arm-protos.h: Update.
	* config/arm/neon.md (UNSPEC_VCONCAT): New.
	(*neon_vswp<VDQX>): New.
	(neon_vcombine<VDX>): Use neon_split_vcombine.
	(neon_vtbl1v16qi, neon_vtbl2v16qi, neon_vcombinev16qi): New.
	* config/arm/vec-common.md (vec_perm_const<VALL>): New.
	(vec_perm<VE>): New.

testsuite/
	* lib/target-supports.exp (check_effective_target_vect_perm,
	check_effective_target_vect_perm_byte,
	check_effective_target_vect_perm_short): Enable for arm neon.

From-SVN: r183051
Richard Henderson 2012-01-09 20:47:26 -08:00 (committed by Richard Henderson)
parent 18f0fe6b98
commit b440f32451
7 changed files with 729 additions and 47 deletions
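As an illustration (a hypothetical testcase, not part of this commit): GCC lowers __builtin_shuffle to VEC_PERM_EXPR, and with these expanders in place a constant selector like the one below should be matched by arm_evpc_neon_vzip and compile to a single vzip.8 instead of element-wise code:

typedef unsigned char v8qi __attribute__ ((vector_size (8)));

v8qi
interleave_low (v8qi a, v8qi b)
{
  /* {0,8,1,9,2,10,3,11} interleaves the low halves of A and B;
     indexes 8-15 select from the second operand.  */
  const v8qi sel = { 0, 8, 1, 9, 2, 10, 3, 11 };
  return __builtin_shuffle (a, b, sel);
}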

@@ -1,3 +1,19 @@
2012-01-10 Richard Henderson <rth@redhat.com>
* config/arm/arm.c (arm_vectorize_vec_perm_const_ok,
TARGET_VECTORIZE_VEC_PERM_CONST_OK, neon_split_vcombine, MAX_VECT_LEN,
struct expand_vec_perm_d, arm_expand_vec_perm_1, arm_expand_vec_perm,
arm_evpc_neon_vuzp, arm_evpc_neon_vzip, arm_evpc_neon_vrev,
arm_evpc_neon_vtrn, arm_evpc_neon_vtbl, arm_expand_vec_perm_const_1,
arm_expand_vec_perm_const): New.
* config/arm/arm-protos.h: Update.
* config/arm/neon.md (UNSPEC_VCONCAT): New.
(*neon_vswp<VDQX>): New.
(neon_vcombine<VDX>): Use neon_split_vcombine.
(neon_vtbl1v16qi, neon_vtbl2v16qi, neon_vcombinev16qi): New.
* config/arm/vec-common.md (vec_perm_const<VALL>): New.
(vec_perm<VE>): New.
2012-01-10 Richard Henderson <rth@redhat.com>
* config/arm/arm.c (arm_gen_compare_reg): Add scratch argument;

@@ -86,6 +86,7 @@ extern void neon_emit_pair_result_insn (enum machine_mode,
rtx (*) (rtx, rtx, rtx, rtx),
rtx, rtx, rtx);
extern void neon_disambiguate_copy (rtx *, rtx *, rtx *, unsigned int);
extern void neon_split_vcombine (rtx op[3]);
extern enum reg_class coproc_secondary_reload_class (enum machine_mode, rtx,
bool);
extern bool arm_tls_referenced_p (rtx);
@@ -243,4 +244,7 @@ extern const struct tune_params *current_tune;
extern int vfp3_const_double_for_fract_bits (rtx);
#endif /* RTX_CODE */
extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
#endif /* ! GCC_ARM_PROTOS_H */

@@ -269,6 +269,9 @@ static unsigned int arm_autovectorize_vector_sizes (void);
static int arm_default_branch_cost (bool, bool);
static int arm_cortex_a5_branch_cost (bool, bool);
static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
const unsigned char *sel);
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
@@ -612,6 +615,10 @@ static const struct attribute_spec arm_attribute_table[] =
#define TARGET_PREFERRED_RENAME_CLASS \
arm_preferred_rename_class
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
arm_vectorize_vec_perm_const_ok
struct gcc_target targetm = TARGET_INITIALIZER;
/* Obstack for minipool constant handling. */
@@ -20915,6 +20922,53 @@ neon_disambiguate_copy (rtx *operands, rtx *dest, rtx *src, unsigned int count)
}
}
/* Split operands into moves from op[1] + op[2] into op[0]. */
void
neon_split_vcombine (rtx operands[3])
{
unsigned int dest = REGNO (operands[0]);
unsigned int src1 = REGNO (operands[1]);
unsigned int src2 = REGNO (operands[2]);
enum machine_mode halfmode = GET_MODE (operands[1]);
unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
rtx destlo, desthi;
if (src1 == dest && src2 == dest + halfregs)
return;
/* Preserve register attributes for variable tracking. */
destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
GET_MODE_SIZE (halfmode));
/* Special case of reversed high/low parts. Use VSWP. */
if (src2 == dest && src1 == dest + halfregs)
{
rtx x = gen_rtx_SET (VOIDmode, destlo, operands[1]);
rtx y = gen_rtx_SET (VOIDmode, desthi, operands[2]);
emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y)));
return;
}
if (!reg_overlap_mentioned_p (operands[2], destlo))
{
/* Try to avoid unnecessary moves if part of the result
is in the right place already. */
if (src1 != dest)
emit_move_insn (destlo, operands[1]);
if (src2 != dest + halfregs)
emit_move_insn (desthi, operands[2]);
}
else
{
if (src2 != dest + halfregs)
emit_move_insn (desthi, operands[2]);
if (src1 != dest)
emit_move_insn (destlo, operands[1]);
}
}
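/* Illustrative register picture (numbers hypothetical, not from the
   patch): combining d4 and d0 into q0 (= d0:d1).  operands[2] lives
   in d0 and overlaps destlo, so the else branch moves the high half
   first (vmov d1, d0) and only then the low half (vmov d0, d4); the
   opposite order would clobber the second source before it is read.  */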
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
@@ -24642,7 +24696,7 @@ vfp3_const_double_for_fract_bits (rtx operand)
}
return 0;
}
/* Emit a memory barrier around an atomic sequence according to MODEL. */
static void
@@ -24945,6 +24999,515 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
arm_post_atomic_barrier (model);
}
#define MAX_VECT_LEN 16
struct expand_vec_perm_d
{
rtx target, op0, op1;
unsigned char perm[MAX_VECT_LEN];
enum machine_mode vmode;
unsigned char nelt;
bool one_vector_p;
bool testing_p;
};
/* Generate a variable permutation. */
static void
arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
enum machine_mode vmode = GET_MODE (target);
bool one_vector_p = rtx_equal_p (op0, op1);
gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
gcc_checking_assert (GET_MODE (op0) == vmode);
gcc_checking_assert (GET_MODE (op1) == vmode);
gcc_checking_assert (GET_MODE (sel) == vmode);
gcc_checking_assert (TARGET_NEON);
if (one_vector_p)
{
if (vmode == V8QImode)
emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
else
emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
}
else
{
rtx pair;
if (vmode == V8QImode)
{
pair = gen_reg_rtx (V16QImode);
emit_insn (gen_neon_vcombinev8qi (pair, op0, op1));
pair = gen_lowpart (TImode, pair);
emit_insn (gen_neon_vtbl2v8qi (target, pair, sel));
}
else
{
pair = gen_reg_rtx (OImode);
emit_insn (gen_neon_vcombinev16qi (pair, op0, op1));
emit_insn (gen_neon_vtbl2v16qi (target, pair, sel));
}
}
}
void
arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
enum machine_mode vmode = GET_MODE (target);
unsigned int i, nelt = GET_MODE_NUNITS (vmode);
bool one_vector_p = rtx_equal_p (op0, op1);
rtx rmask[MAX_VECT_LEN], mask;
/* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's
numbering of elements for big-endian, we must reverse the order. */
gcc_checking_assert (!BYTES_BIG_ENDIAN);
/* The VTBL instruction does not use a modulo index, so we must take care
of that ourselves. */
mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
for (i = 0; i < nelt; ++i)
rmask[i] = mask;
mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
arm_expand_vec_perm_1 (target, op0, op1, sel);
}
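/* Worked example (not part of the patch): for V8QImode with two
   distinct inputs, nelt == 8, so the constant mask built above is
   {15,15,...,15}.  A selector element of 19 is reduced to
   19 & 15 == 3, giving the modulo-2*nelt semantics VEC_PERM_EXPR
   requires; VTBL itself would instead write 0 for any index beyond
   the table.  */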
/* Generate or test for an insn that supports a constant permutation. */
/* Recognize patterns for the VUZP insns. */
static bool
arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
{
unsigned int i, odd, mask, nelt = d->nelt;
rtx out0, out1, in0, in1, x;
rtx (*gen)(rtx, rtx, rtx, rtx);
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
/* Note that these are little-endian tests. Adjust for big-endian later. */
if (d->perm[0] == 0)
odd = 0;
else if (d->perm[0] == 1)
odd = 1;
else
return false;
mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
for (i = 0; i < nelt; i++)
{
unsigned elt = (i * 2 + odd) & mask;
if (d->perm[i] != elt)
return false;
}
/* Success! */
if (d->testing_p)
return true;
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vuzpv16qi_internal; break;
case V8QImode: gen = gen_neon_vuzpv8qi_internal; break;
case V8HImode: gen = gen_neon_vuzpv8hi_internal; break;
case V4HImode: gen = gen_neon_vuzpv4hi_internal; break;
case V4SImode: gen = gen_neon_vuzpv4si_internal; break;
case V2SImode: gen = gen_neon_vuzpv2si_internal; break;
case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break;
case V4SFmode: gen = gen_neon_vuzpv4sf_internal; break;
default:
gcc_unreachable ();
}
in0 = d->op0;
in1 = d->op1;
if (BYTES_BIG_ENDIAN)
{
x = in0, in0 = in1, in1 = x;
odd = !odd;
}
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
if (odd)
x = out0, out0 = out1, out1 = x;
emit_insn (gen (out0, in0, in1, out1));
return true;
}
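/* Hypothetical selector accepted by this matcher for V8QImode with
   two distinct inputs: {0,2,4,6,8,10,12,14}, i.e. odd == 0 and
   d->perm[i] == (2*i) & 15 throughout -- the even elements of both
   vectors, which a single VUZP delivers in its first output register.  */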
/* Recognize patterns for the VZIP insns. */
static bool
arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
{
unsigned int i, high, mask, nelt = d->nelt;
rtx out0, out1, in0, in1, x;
rtx (*gen)(rtx, rtx, rtx, rtx);
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
/* Note that these are little-endian tests. Adjust for big-endian later. */
high = nelt / 2;
if (d->perm[0] == high)
;
else if (d->perm[0] == 0)
high = 0;
else
return false;
mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
for (i = 0; i < nelt / 2; i++)
{
unsigned elt = (i + high) & mask;
if (d->perm[i * 2] != elt)
return false;
elt = (elt + nelt) & mask;
if (d->perm[i * 2 + 1] != elt)
return false;
}
/* Success! */
if (d->testing_p)
return true;
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vzipv16qi_internal; break;
case V8QImode: gen = gen_neon_vzipv8qi_internal; break;
case V8HImode: gen = gen_neon_vzipv8hi_internal; break;
case V4HImode: gen = gen_neon_vzipv4hi_internal; break;
case V4SImode: gen = gen_neon_vzipv4si_internal; break;
case V2SImode: gen = gen_neon_vzipv2si_internal; break;
case V2SFmode: gen = gen_neon_vzipv2sf_internal; break;
case V4SFmode: gen = gen_neon_vzipv4sf_internal; break;
default:
gcc_unreachable ();
}
in0 = d->op0;
in1 = d->op1;
if (BYTES_BIG_ENDIAN)
{
x = in0, in0 = in1, in1 = x;
high = !high;
}
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
if (high)
x = out0, out0 = out1, out1 = x;
emit_insn (gen (out0, in0, in1, out1));
return true;
}
/* Recognize patterns for the VREV insns. */
static bool
arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
{
unsigned int i, j, diff, nelt = d->nelt;
rtx (*gen)(rtx, rtx, rtx);
if (!d->one_vector_p)
return false;
diff = d->perm[0];
switch (diff)
{
case 7:
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vrev64v16qi; break;
case V8QImode: gen = gen_neon_vrev64v8qi; break;
default:
return false;
}
break;
case 3:
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vrev32v16qi; break;
case V8QImode: gen = gen_neon_vrev32v8qi; break;
case V8HImode: gen = gen_neon_vrev64v8hi; break;
case V4HImode: gen = gen_neon_vrev64v4hi; break;
default:
return false;
}
break;
case 1:
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vrev16v16qi; break;
case V8QImode: gen = gen_neon_vrev16v8qi; break;
case V8HImode: gen = gen_neon_vrev32v8hi; break;
case V4HImode: gen = gen_neon_vrev32v4hi; break;
case V4SImode: gen = gen_neon_vrev64v4si; break;
case V2SImode: gen = gen_neon_vrev64v2si; break;
case V4SFmode: gen = gen_neon_vrev64v4sf; break;
case V2SFmode: gen = gen_neon_vrev64v2sf; break;
default:
return false;
}
break;
default:
return false;
}
for (i = 0; i < nelt; i += diff + 1)
for (j = 0; j <= diff; j += 1)
if (d->perm[i + j] != i + diff - j)
return false;
/* Success! */
if (d->testing_p)
return true;
/* ??? The third operand is an artifact of the builtin infrastructure
and is ignored by the actual instruction. */
emit_insn (gen (d->target, d->op0, const0_rtx));
return true;
}
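/* Hypothetical selectors, all accepted by the checks above:
     V8QImode, diff == 7: {7,6,5,4,3,2,1,0}  -> vrev64.8
     V8HImode, diff == 3: {3,2,1,0,7,6,5,4}  -> vrev64.16
     V4HImode, diff == 1: {1,0,3,2}          -> vrev32.16
   i.e. DIFF is the highest index within each reversed group.  */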
/* Recognize patterns for the VTRN insns. */
static bool
arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
{
unsigned int i, odd, mask, nelt = d->nelt;
rtx out0, out1, in0, in1, x;
rtx (*gen)(rtx, rtx, rtx, rtx);
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
/* Note that these are little-endian tests. Adjust for big-endian later. */
if (d->perm[0] == 0)
odd = 0;
else if (d->perm[0] == 1)
odd = 1;
else
return false;
mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
for (i = 0; i < nelt; i += 2)
{
if (d->perm[i] != i + odd)
return false;
if (d->perm[i + 1] != ((i + nelt + odd) & mask))
return false;
}
/* Success! */
if (d->testing_p)
return true;
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vtrnv16qi_internal; break;
case V8QImode: gen = gen_neon_vtrnv8qi_internal; break;
case V8HImode: gen = gen_neon_vtrnv8hi_internal; break;
case V4HImode: gen = gen_neon_vtrnv4hi_internal; break;
case V4SImode: gen = gen_neon_vtrnv4si_internal; break;
case V2SImode: gen = gen_neon_vtrnv2si_internal; break;
case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break;
case V4SFmode: gen = gen_neon_vtrnv4sf_internal; break;
default:
gcc_unreachable ();
}
in0 = d->op0;
in1 = d->op1;
if (BYTES_BIG_ENDIAN)
{
x = in0, in0 = in1, in1 = x;
odd = !odd;
}
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
if (odd)
x = out0, out0 = out1, out1 = x;
emit_insn (gen (out0, in0, in1, out1));
return true;
}
/* The NEON VTBL instruction is a fully variable permutation that's even
stronger than what we expose via VEC_PERM_EXPR. What it doesn't do
is mask the index operand as VEC_PERM_EXPR requires. Therefore we
can do slightly better by expanding this as a constant where we don't
have to apply a mask. */
static bool
arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
{
rtx rperm[MAX_VECT_LEN], sel;
enum machine_mode vmode = d->vmode;
unsigned int i, nelt = d->nelt;
/* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's
numbering of elements for big-endian, we must reverse the order. */
if (BYTES_BIG_ENDIAN)
return false;
if (d->testing_p)
return true;
/* Generic code will try constant permutation twice. Once with the
original mode and again with the elements lowered to QImode.
So wait and don't do the selector expansion ourselves. */
if (vmode != V8QImode && vmode != V16QImode)
return false;
for (i = 0; i < nelt; ++i)
rperm[i] = GEN_INT (d->perm[i]);
sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
sel = force_reg (vmode, sel);
arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
return true;
}
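/* Illustrative: a V8QImode selector such as {0,7,1,6,2,5,3,4} matches
   none of the VUZP/VZIP/VREV/VTRN recognizers, so it is materialized
   as a literal vector and handled by a single vtbl.8; no masking AND
   is needed because the constant indexes are already in range.  */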
static bool
arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
/* The pattern matching functions above are written to look for a small
number to begin the sequence (0, 1, N/2). If we begin with an index
from the second operand, we can swap the operands. */
if (d->perm[0] >= d->nelt)
{
unsigned i, nelt = d->nelt;
rtx x;
for (i = 0; i < nelt; ++i)
d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
x = d->op0;
d->op0 = d->op1;
d->op1 = x;
}
if (TARGET_NEON)
{
if (arm_evpc_neon_vuzp (d))
return true;
if (arm_evpc_neon_vzip (d))
return true;
if (arm_evpc_neon_vrev (d))
return true;
if (arm_evpc_neon_vtrn (d))
return true;
return arm_evpc_neon_vtbl (d);
}
return false;
}
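/* Worked example (V2SImode, nelt == 2): the selector {2,1} begins
   with an index into the second operand, so it is rewritten to {0,3}
   (each index incremented by nelt, modulo 2*nelt) and op0/op1 are
   exchanged -- the same permutation, but now starting with a small
   index as the matchers above expect.  */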
/* Expand a vec_perm_const pattern. */
bool
arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
{
struct expand_vec_perm_d d;
int i, nelt, which;
d.target = target;
d.op0 = op0;
d.op1 = op1;
d.vmode = GET_MODE (target);
gcc_assert (VECTOR_MODE_P (d.vmode));
d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
d.testing_p = false;
for (i = which = 0; i < nelt; ++i)
{
rtx e = XVECEXP (sel, 0, i);
int ei = INTVAL (e) & (2 * nelt - 1);
which |= (ei < nelt ? 1 : 2);
d.perm[i] = ei;
}
switch (which)
{
default:
gcc_unreachable();
case 3:
d.one_vector_p = false;
if (!rtx_equal_p (op0, op1))
break;
/* The elements of PERM do not suggest that only the first operand
is used, but both operands are identical. Allow easier matching
of the permutation by folding the permutation into the single
input vector. */
/* FALLTHRU */
case 2:
for (i = 0; i < nelt; ++i)
d.perm[i] &= nelt - 1;
d.op0 = op1;
d.one_vector_p = true;
break;
case 1:
d.op1 = op0;
d.one_vector_p = true;
break;
}
return arm_expand_vec_perm_const_1 (&d);
}
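/* Illustrative (V4HImode, nelt == 4): a selector {4,5,6,7} touches
   only the second vector, so WHICH == 2; the indexes fold to
   {0,1,2,3}, d.op0 becomes op1, and the matchers see a one-vector
   permutation.  {0,5,2,7} touches both vectors, giving WHICH == 3.  */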
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. */
static bool
arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
const unsigned char *sel)
{
struct expand_vec_perm_d d;
unsigned int i, nelt, which;
bool ret;
d.vmode = vmode;
d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
d.testing_p = true;
memcpy (d.perm, sel, nelt);
/* Categorize the set of elements in the selector. */
for (i = which = 0; i < nelt; ++i)
{
unsigned char e = d.perm[i];
gcc_assert (e < 2 * nelt);
which |= (e < nelt ? 1 : 2);
}
/* For all elements from second vector, fold the elements to first. */
if (which == 2)
for (i = 0; i < nelt; ++i)
d.perm[i] -= nelt;
/* Check whether the mask can be applied to the vector type. */
d.one_vector_p = (which != 3);
d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
if (!d.one_vector_p)
d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
start_sequence ();
ret = arm_expand_vec_perm_const_1 (&d);
end_sequence ();
return ret;
}
#include "gt-arm.h"

@@ -1,5 +1,6 @@
;; ARM NEON coprocessor Machine Description
;; Copyright (C) 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
;; Copyright (C) 2006, 2007, 2008, 2009, 2010, 2012
;; Free Software Foundation, Inc.
;; Written by CodeSourcery.
;;
;; This file is part of GCC.
@@ -35,6 +36,7 @@
UNSPEC_VCGE
UNSPEC_VCGT
UNSPEC_VCLS
UNSPEC_VCONCAT
UNSPEC_VCVT
UNSPEC_VCVT_N
UNSPEC_VEXT
@@ -2860,6 +2862,20 @@
DONE;
})
;; Disabled before reload because we don't want combine doing something silly,
;; but used by the post-reload expansion of neon_vcombine.
(define_insn "*neon_vswp<mode>"
[(set (match_operand:VDQX 0 "s_register_operand" "+w")
(match_operand:VDQX 1 "s_register_operand" "+w"))
(set (match_dup 1) (match_dup 0))]
"TARGET_NEON && reload_completed"
"vswp\t%<V_reg>1, %<V_reg>2"
[(set (attr "neon_type")
(if_then_else (match_test "<Is_d_reg>")
(const_string "neon_bp_simple")
(const_string "neon_bp_2cycle")))]
)
;; In this insn, operand 1 should be low, and operand 2 the high part of the
;; dest vector.
;; FIXME: A different implementation of this builtin could make it much
@@ -2867,48 +2883,19 @@
;; it so that the reg allocator puts things in the right places magically
;; instead). Lack of subregs for vectors makes that tricky though, I think.
(define_insn "neon_vcombine<mode>"
(define_insn_and_split "neon_vcombine<mode>"
[(set (match_operand:<V_DOUBLE> 0 "s_register_operand" "=w")
(vec_concat:<V_DOUBLE> (match_operand:VDX 1 "s_register_operand" "w")
(match_operand:VDX 2 "s_register_operand" "w")))]
(vec_concat:<V_DOUBLE>
(match_operand:VDX 1 "s_register_operand" "w")
(match_operand:VDX 2 "s_register_operand" "w")))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
int dest = REGNO (operands[0]);
int src1 = REGNO (operands[1]);
int src2 = REGNO (operands[2]);
rtx destlo;
if (src1 == dest && src2 == dest + 2)
return "";
else if (src2 == dest && src1 == dest + 2)
/* Special case of reversed high/low parts. */
return "vswp\t%P1, %P2";
destlo = gen_rtx_REG (<MODE>mode, dest);
if (!reg_overlap_mentioned_p (operands[2], destlo))
{
/* Try to avoid unnecessary moves if part of the result is in the right
place already. */
if (src1 != dest)
output_asm_insn ("vmov\t%e0, %P1", operands);
if (src2 != dest + 2)
output_asm_insn ("vmov\t%f0, %P2", operands);
}
else
{
if (src2 != dest + 2)
output_asm_insn ("vmov\t%f0, %P2", operands);
if (src1 != dest)
output_asm_insn ("vmov\t%e0, %P1", operands);
}
return "";
}
;; We set the neon_type attribute based on the vmov instructions above.
[(set_attr "length" "8")
(set_attr "neon_type" "neon_bp_simple")]
)
neon_split_vcombine (operands);
DONE;
})
(define_expand "neon_vget_high<mode>"
[(match_operand:<V_HALF> 0 "s_register_operand")
@@ -3920,6 +3907,83 @@
[(set_attr "neon_type" "neon_bp_3cycle")]
)
;; These three are used by the vec_perm infrastructure for V16QImode.
(define_insn_and_split "neon_vtbl1v16qi"
[(set (match_operand:V16QI 0 "s_register_operand" "=&w")
(unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "w")
(match_operand:V16QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
rtx op0, op1, op2, part0, part2;
unsigned ofs;
op0 = operands[0];
op1 = gen_lowpart (TImode, operands[1]);
op2 = operands[2];
ofs = subreg_lowpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
ofs = subreg_highpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
DONE;
})
(define_insn_and_split "neon_vtbl2v16qi"
[(set (match_operand:V16QI 0 "s_register_operand" "=&w")
(unspec:V16QI [(match_operand:OI 1 "s_register_operand" "w")
(match_operand:V16QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
rtx op0, op1, op2, part0, part2;
unsigned ofs;
op0 = operands[0];
op1 = operands[1];
op2 = operands[2];
ofs = subreg_lowpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
ofs = subreg_highpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
DONE;
})
;; ??? Logically we should extend the regular neon_vcombine pattern to
;; handle quad-word input modes, producing octa-word output modes. But
;; that requires us to add support for octa-word vector modes in moves.
;; That seems overkill for this one use in vec_perm.
(define_insn_and_split "neon_vcombinev16qi"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:V16QI 1 "s_register_operand" "w")
(match_operand:V16QI 2 "s_register_operand" "w")]
UNSPEC_VCONCAT))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
neon_split_vcombine (operands);
DONE;
})
(define_insn "neon_vtbx1v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")

@@ -1,5 +1,5 @@
;; Machine Description for shared bits common to IWMMXT and Neon.
;; Copyright (C) 2006, 2007, 2010 Free Software Foundation, Inc.
;; Copyright (C) 2006, 2007, 2010, 2012 Free Software Foundation, Inc.
;; Written by CodeSourcery.
;;
;; This file is part of GCC.
@@ -108,3 +108,29 @@
|| (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
{
})
(define_expand "vec_perm_const<mode>"
[(match_operand:VALL 0 "s_register_operand" "")
(match_operand:VALL 1 "s_register_operand" "")
(match_operand:VALL 2 "s_register_operand" "")
(match_operand:<V_cmp_result> 3 "" "")]
"TARGET_NEON
|| (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
{
if (arm_expand_vec_perm_const (operands[0], operands[1],
operands[2], operands[3]))
DONE;
else
FAIL;
})
(define_expand "vec_perm<mode>"
[(match_operand:VE 0 "s_register_operand" "")
(match_operand:VE 1 "s_register_operand" "")
(match_operand:VE 2 "s_register_operand" "")
(match_operand:VE 3 "s_register_operand" "")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
arm_expand_vec_perm (operands[0], operands[1], operands[2], operands[3]);
DONE;
})
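;; Illustrative use (not part of the patch): a fully variable shuffle
;; such as
;;   v8qi f (v8qi x, v8qi sel) { return __builtin_shuffle (x, sel); }
;; now expands through vec_perm<mode>: the selector is ANDed with the
;; modulo mask and fed to a single vtbl.8 instead of being lowered to
;; per-element extracts and inserts.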

@@ -1,3 +1,9 @@
2012-01-10 Richard Henderson <rth@redhat.com>
* lib/target-supports.exp (check_effective_target_vect_perm,
check_effective_target_vect_perm_byte,
check_effective_target_vect_perm_short): Enable for arm neon.
2012-01-09 Tobias Burnus <burnus@net-b.de>
PR fortran/46328

@@ -1,5 +1,5 @@
# Copyright (C) 1999, 2001, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011 Free Software Foundation, Inc.
# 2011, 2012 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -2733,7 +2733,8 @@ proc check_effective_target_vect_perm { } {
verbose "check_effective_target_vect_perm: using cached result" 2
} else {
set et_vect_perm_saved 0
if { [istarget powerpc*-*-*]
if { [is-effective-target arm_neon_ok]
|| [istarget powerpc*-*-*]
|| [istarget spu-*-*]
|| [istarget i?86-*-*]
|| [istarget x86_64-*-*] } {
@@ -2756,7 +2757,8 @@ proc check_effective_target_vect_perm_byte { } {
verbose "check_effective_target_vect_perm_byte: using cached result" 2
} else {
set et_vect_perm_byte_saved 0
if { [istarget powerpc*-*-*]
if { [is-effective-target arm_neon_ok]
|| [istarget powerpc*-*-*]
|| [istarget spu-*-*] } {
set et_vect_perm_byte_saved 1
}
@@ -2777,7 +2779,8 @@ proc check_effective_target_vect_perm_short { } {
verbose "check_effective_target_vect_perm_short: using cached result" 2
} else {
set et_vect_perm_short_saved 0
if { [istarget powerpc*-*-*]
if { [is-effective-target arm_neon_ok]
|| [istarget powerpc*-*-*]
|| [istarget spu-*-*] } {
set et_vect_perm_short_saved 1
}