re PR middle-end/68542 (10% 481.wrf performance regression)
gcc/
2016-02-02  Yuri Rumyantsev  <ysrumyan@gmail.com>

	PR middle-end/68542
	* config/i386/i386.c (ix86_expand_branch): Add support for conditional
	branch with vector comparison.
	* config/i386/sse.md (VI48_AVX): New mode iterator.
	(define_expand "cbranch<mode>4"): Add support for conditional branch
	with vector comparison.
	* tree-vect-loop.c (optimize_mask_stores): New function.
	* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
	has_mask_store field of vect_info.
	* tree-vectorizer.c (vectorize_loops): Invoke optimize_mask_stores for
	vectorized loops having masked stores after vec_info destroy.
	* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
	corresponding macros.
	(optimize_mask_stores): Add prototype.

gcc/testsuite/
2016-02-02  Yuri Rumyantsev  <ysrumyan@gmail.com>

	PR middle-end/68542
	* gcc.dg/vect/vect-mask-store-move-1.c: New test.
	* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

From-SVN: r233068
commit 2d4dc2233b
parent 65c98fdec7
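For orientation before the diff: the patch teaches the vectorizer to guard each group of masked stores with a runtime "is the mask all zero?" branch, so iterations with no active lanes skip the stores and their value producers entirely. Below is a rough intrinsics-level sketch (editorial, not taken from the patch) of what the transformed test-case loop amounts to, assuming AVX2, a vector factor of 8, and nonnegative c[i]; foo_sketch is an illustrative name:

#include <immintrin.h>

#define N 256
int p1[N], p2[N], p3[N], c[N];

void foo_sketch (int n)
{
  int i;
  __m256i zero = _mm256_setzero_si256 ();
  for (i = 0; i + 8 <= n; i += 8)
    {
      __m256i vc = _mm256_loadu_si256 ((__m256i const *) &c[i]);
      /* Lanes with c[i] != 0 (assuming c[i] >= 0, as in the tests).  */
      __m256i mask = _mm256_cmpgt_epi32 (vc, zero);

      /* The new cbranch pattern: skip the whole semi-hammock when no
	 lane is active (VPTEST sets ZF when the AND of its operands
	 is zero).  */
      if (!_mm256_testz_si256 (mask, mask))
	{
	  __m256i v1 = _mm256_maskload_epi32 (&p1[i], mask);
	  v1 = _mm256_add_epi32 (v1, _mm256_set1_epi32 (1));
	  _mm256_maskstore_epi32 (&p1[i], mask, v1);
	  __m256i v3 = _mm256_maskload_epi32 (&p3[i], mask);
	  v3 = _mm256_add_epi32 (v3, _mm256_set1_epi32 (2));
	  _mm256_maskstore_epi32 (&p2[i], mask, v3);
	}
    }
  /* Scalar epilogue for the remainder.  */
  for (; i < n; i++)
    if (c[i])
      {
	p1[i] += 1;
	p2[i] = p3[i] + 2;
      }
}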
gcc/ChangeLog
@@ -1,3 +1,20 @@
+2016-02-02  Yuri Rumyantsev  <ysrumyan@gmail.com>
+
+	PR middle-end/68542
+	* config/i386/i386.c (ix86_expand_branch): Add support for conditional
+	branch with vector comparison.
+	* config/i386/sse.md (VI48_AVX): New mode iterator.
+	(define_expand "cbranch<mode>4"): Add support for conditional branch
+	with vector comparison.
+	* tree-vect-loop.c (optimize_mask_stores): New function.
+	* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
+	has_mask_store field of vect_info.
+	* tree-vectorizer.c (vectorize_loops): Invoke optimize_mask_stores for
+	vectorized loops having masked stores after vec_info destroy.
+	* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
+	corresponding macros.
+	(optimize_mask_stores): Add prototype.
+
 2016-02-02  Alan Modra  <amodra@gmail.com>
 
 	PR target/69548
gcc/config/i386/i386.c
@@ -21684,6 +21684,30 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   machine_mode mode = GET_MODE (op0);
   rtx tmp;
 
+  /* Handle special case - vector comparison with boolean result, transform
+     it using ptest instruction.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
+      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+
+      gcc_assert (code == EQ || code == NE);
+      /* Generate XOR since we can't check that one operand is zero vector.  */
+      tmp = gen_reg_rtx (mode);
+      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+      tmp = gen_lowpart (p_mode, tmp);
+      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
+			      gen_rtx_UNSPEC (CCmode,
+					      gen_rtvec (2, tmp, tmp),
+					      UNSPEC_PTEST)));
+      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+				  gen_rtx_LABEL_REF (VOIDmode, label),
+				  pc_rtx);
+      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      return;
+    }
+
   switch (mode)
     {
     case SFmode:
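The XOR-plus-PTEST sequence emitted above can be read as the following intrinsics-level C, a sketch assuming AVX2 for the 256-bit case (vectors_equal is a hypothetical name, not part of the patch). (V)PTEST sets ZF when the AND of its operands is zero, so testing the XOR of the inputs against itself checks whether op0 == op1; the XOR is needed precisely because neither operand is known to be the zero vector:

#include <immintrin.h>

static int
vectors_equal (__m256i a, __m256i b)
{
  __m256i diff = _mm256_xor_si256 (a, b); /* like the gen_rtx_XOR above */
  return _mm256_testz_si256 (diff, diff); /* like UNSPEC_PTEST: 1 iff ZF,
					     i.e. diff == 0, i.e. a == b */
}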
gcc/config/i386/sse.md
@@ -305,6 +305,10 @@
    (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
    (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")])
 
+(define_mode_iterator VI48_AVX
+  [V4SI V2DI
+   (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")])
+
 (define_mode_iterator VI8
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
 
@@ -18225,6 +18229,23 @@
    (match_operand:<avx512fmaskmode> 2 "register_operand")))]
   "TARGET_AVX512BW")
 
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:VI48_AVX 1 "register_operand")
+		    (match_operand:VI48_AVX 2 "nonimmediate_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "bt_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_SSE4_1"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+
 (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
 	(unspec:AVX256MODE2P
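For context (editorial note): this expander is what RTL expansion selects when it reaches the GIMPLE_COND on a vector equality that optimize_mask_stores creates, in the document's own GIMPLE notation something like:

  if (mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <join_bb>;

The expander then defers to ix86_expand_branch above, which emits the XOR/PTEST/jump sequence.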
gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2016-02-02  Yuri Rumyantsev  <ysrumyan@gmail.com>
+
+	PR middle-end/68542
+	* gcc.dg/vect/vect-mask-store-move-1.c: New test.
+	* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
+
 2016-02-02  Alan Modra  <amodra@gmail.com>
 
 	PR target/69548
gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c (new file, 19 lines)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
+
+#define N 256
+int p1[N], p2[N], p3[N];
+int c[N];
+void foo (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] + 2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move stmt to created bb" 6 "vect" } } */
gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c (new file, 79 lines)
@@ -0,0 +1,79 @@
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#define N 32
+int *p1, *p2, *p3;
+int c[N];
+int p1ref[N], p2ref[N];
+
+__attribute__((noinline, noclone)) void foo (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] + 2;
+      }
+}
+
+void init ()
+{
+  p1ref[0]=1; p2ref[0]=2;
+  p1ref[1]=3; p2ref[1]=5;
+  p1ref[2]=5; p2ref[2]=8;
+  p1ref[3]=7; p2ref[3]=11;
+  p1ref[4]=9; p2ref[4]=14;
+  p1ref[5]=11; p2ref[5]=17;
+  p1ref[6]=13; p2ref[6]=20;
+  p1ref[7]=15; p2ref[7]=23;
+  p1ref[8]=16; p2ref[8]=8;
+  p1ref[9]=18; p2ref[9]=9;
+  p1ref[10]=20; p2ref[10]=10;
+  p1ref[11]=22; p2ref[11]=11;
+  p1ref[12]=24; p2ref[12]=12;
+  p1ref[13]=26; p2ref[13]=13;
+  p1ref[14]=28; p2ref[14]=14;
+  p1ref[15]=30; p2ref[15]=15;
+  p1ref[16]=33; p2ref[16]=50;
+  p1ref[17]=35; p2ref[17]=53;
+  p1ref[18]=37; p2ref[18]=56;
+  p1ref[19]=39; p2ref[19]=59;
+  p1ref[20]=41; p2ref[20]=62;
+  p1ref[21]=43; p2ref[21]=65;
+  p1ref[22]=45; p2ref[22]=68;
+  p1ref[23]=47; p2ref[23]=71;
+  p1ref[24]=48; p2ref[24]=24;
+  p1ref[25]=50; p2ref[25]=25;
+  p1ref[26]=52; p2ref[26]=26;
+  p1ref[27]=54; p2ref[27]=27;
+  p1ref[28]=56; p2ref[28]=28;
+  p1ref[29]=58; p2ref[29]=29;
+  p1ref[30]=60; p2ref[30]=30;
+  p1ref[31]=62; p2ref[31]=31;
+}
+
+static void
+avx2_test (void)
+{
+  int * P = malloc (N * 3 * sizeof (int));
+  int i;
+
+  p1 = &P[0];
+  p2 = &P[N];
+  p3 = &P[2 * N];
+  for (i = 0; i < N; i++) {
+    p1[i] = i + i;
+    p3[i] = i * 3;
+    p2[i] = i;
+    c[i] = (i >> 3) & 1 ? 0 : 1;
+  }
+  init ();
+  foo (N);
+  for (i = 0; i < N; i++)
+    if (p1[i] != p1ref[i] || p2[i] != p2ref[i])
+      abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "Move stmt to created bb" 6 "vect" } } */
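The hard-coded reference arrays in init () follow directly from avx2_test's initialization (p1[i] = 2*i, p2[i] = i, p3[i] = 3*i) and the mask c[i] = ((i >> 3) & 1) ? 0 : 1, which toggles lanes on and off in alternating blocks of eight. A sketch of an equivalent computed form (editorial; init_computed is a hypothetical helper, not in the patch):

#define N 32
static int p1ref[N], p2ref[N];

static void
init_computed (void)
{
  int i;
  for (i = 0; i < N; i++)
    {
      int ci = ((i >> 3) & 1) ? 0 : 1;   /* same pattern as c[i] */
      p1ref[i] = ci ? 2 * i + 1 : 2 * i; /* p1[i] = i + i, then += 1 if c[i] */
      p2ref[i] = ci ? 3 * i + 2 : i;     /* p2[i] = p3[i] + 2 where c[i] */
    }
}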
gcc/tree-vect-loop.c
@@ -6938,3 +6938,195 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 }
+
+/* The code below performs a simple optimization - it reverts
+   if-conversion for masked stores: if the mask of a store is zero,
+   the store is not performed, and neither are the statements that
+   produce the stored values, where possible.
+   For example,
+     for (i=0; i<n; i++)
+       if (c[i])
+	 {
+	   p1[i] += 1;
+	   p2[i] = p3[i] + 2;
+	 }
+   this transformation will produce the following semi-hammock:
+
+   if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
+     {
+       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
+       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
+       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
+       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
+       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
+       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
+     }
+*/
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block *bbs = get_loop_body (loop);
+  unsigned nbbs = loop->num_nodes;
+  unsigned i;
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  gimple *stmt, *stmt1 = NULL;
+  auto_vec<gimple *> worklist;
+
+  vect_location = find_loop_location (loop);
+  /* Pick up all masked stores in loop if any.  */
+  for (i = 0; i < nbbs; i++)
+    {
+      bb = bbs[i];
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  stmt = gsi_stmt (gsi);
+	  if (is_gimple_call (stmt)
+	      && gimple_call_internal_p (stmt)
+	      && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	    worklist.safe_push (stmt);
+	}
+    }
+
+  free (bbs);
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple *last, *last_store;
+      edge e, efalse;
+      tree mask;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      tree vdef, new_vdef;
+      gphi *phi;
+      tree vectype;
+      tree zero;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      bb = gimple_bb (last);
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Put STORE_BB to likely part.  */
+      efalse->probability = PROB_UNLIKELY;
+      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Create new block %d to sink mask stores.",
+			 store_bb->index);
+      /* Create vector comparison with boolean result.  */
+      vectype = TREE_TYPE (mask);
+      zero = build_zero_cst (vectype);
+      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      gimple_set_vdef (last, new_vdef);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+	{
+	  gimple_stmt_iterator gsi_from;
+	  /* Move masked store to STORE_BB.  */
+	  last_store = last;
+	  gsi = gsi_for_stmt (last);
+	  gsi_from = gsi;
+	  /* Shift GSI to the previous stmt for further traversal.  */
+	  gsi_prev (&gsi);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi_from, &gsi_to);
+	  /* Setup GSI_TO to the non-empty block start.  */
+	  gsi_to = gsi_start_bb (store_bb);
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "Move stmt to created bb\n");
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+	    }
+	  /* Move all stored value producers if possible.  */
+	  while (!gsi_end_p (gsi))
+	    {
+	      tree lhs;
+	      imm_use_iterator imm_iter;
+	      use_operand_p use_p;
+	      bool res;
+	      stmt1 = gsi_stmt (gsi);
+	      /* Do not consider statements writing to memory.  */
+	      if (gimple_vdef (stmt1))
+		break;
+	      gsi_from = gsi;
+	      gsi_prev (&gsi);
+	      lhs = gimple_get_lhs (stmt1);
+	      if (!lhs)
+		break;
+
+	      /* LHS of vectorized stmt must be SSA_NAME.  */
+	      if (TREE_CODE (lhs) != SSA_NAME)
+		break;
+
+	      /* Skip scalar statements.  */
+	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
+		continue;
+
+	      /* Check that LHS does not have uses outside of STORE_BB.  */
+	      res = true;
+	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+		{
+		  gimple *use_stmt;
+		  use_stmt = USE_STMT (use_p);
+		  if (gimple_bb (use_stmt) != store_bb)
+		    {
+		      res = false;
+		      break;
+		    }
+		}
+	      if (!res)
+		break;
+
+	      if (gimple_vuse (stmt1)
+		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
+		break;
+
+	      /* Can move STMT1 to STORE_BB.  */
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_NOTE, vect_location,
+				   "Move stmt to created bb\n");
+		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
+		}
+	      gsi_move_before (&gsi_from, &gsi_to);
+	      /* Shift GSI_TO for further insertion.  */
+	      gsi_prev (&gsi_to);
+	    }
+	  /* Put other masked stores with the same mask to STORE_BB.  */
+	  if (worklist.is_empty ()
+	      || gimple_call_arg (worklist.last (), 2) != mask
+	      || worklist.last () != stmt1)
+	    break;
+	  last = worklist.pop ();
+	}
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
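For orientation, an editorial sketch (not part of the patch) of the control flow optimize_mask_stores builds around each masked-store group, reconstructed from the code above:

/* Editorial sketch of the resulting CFG for one masked-store group,
   with mask M and the original block BB split after the last store:

     BB:        ...
                if (M == {0,...,0}) goto JOIN_BB;  ; new vector compare;
                                                   ; EDGE_TRUE_VALUE skips
                                                   ; the stores, the false
                                                   ; edge falls into...
     STORE_BB:  MASK_STORE (..., M, ...);          ; sunk stores plus any
                .MEM_3 = VDEF <.MEM_1>             ; movable producers;
                                                   ; EDGE_FALLTHRU to...
     JOIN_BB:   .MEM_2 = PHI <.MEM_1(BB), .MEM_3(STORE_BB)>
*/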
gcc/tree-vect-stmts.c
@@ -2023,6 +2023,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
	{
	  unsigned align, misalign;
gcc/tree-vectorizer.c
@@ -604,12 +604,18 @@ vectorize_loops (void)
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
	continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+	has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
+      if (has_mask_store)
+	optimize_mask_stores (loop);
       loop->aux = NULL;
     }
 
gcc/tree-vectorizer.h
@@ -333,6 +333,9 @@ typedef struct _loop_vec_info : public vec_info {
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -368,6 +371,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
 #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
+#define LOOP_VINFO_HAS_MASK_STORE(L)       (L)->has_mask_store
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
 
@@ -1013,6 +1017,7 @@ extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
				       vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void optimize_mask_stores (struct loop *);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);