mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-19 21:31:42 +08:00
optimize permutes in SLP, remove vect_attempt_slp_rearrange_stmts
This introduces a permute optimization phase for SLP which is intended to cover the existing permute eliding for SLP reductions plus handling commonizing the easy cases. It currently uses graphds to compute a postorder on the reverse SLP graph and it handles all cases vect_attempt_slp_rearrange_stmts did (hopefully - I've adjusted most testcases that triggered it a few days ago). It restricts itself to move around bijective permutations to simplify things for now, mainly around constant nodes. As a prerequesite it makes the SLP graph cyclic (ugh). It looks like it would pay off to compute a PRE/POST order visit array once and elide all the recursive SLP graph walks and their visited hash-set. At least for the time where we do not change the SLP graph during such walk. I do not like using graphds too much but at least I don't have to re-implement yet another RPO walk, so maybe it isn't too bad. It now computes permute placement during iteration and thus should get cycles more obviously correct. Richard. 2020-10-06 Richard Biener <rguenther@suse.de> * tree-vect-data-refs.c (vect_slp_analyze_instance_dependence): Use SLP_TREE_REPRESENTATIVE. * tree-vectorizer.h (_slp_tree::vertex): New member used for graphds interfacing. * tree-vect-slp.c (vect_build_slp_tree_2): Allocate space for PHI SLP children. (vect_analyze_slp_backedges): New function filling in SLP node children for PHIs that correspond to backedge values. (vect_analyze_slp): Call vect_analyze_slp_backedges for the graph. (vect_slp_analyze_node_operations): Deal with a cyclic graph. (vect_schedule_slp_instance): Likewise. (vect_schedule_slp): Likewise. (slp_copy_subtree): Remove. (vect_slp_rearrange_stmts): Likewise. (vect_attempt_slp_rearrange_stmts): Likewise. (vect_slp_build_vertices): New functions. (vect_slp_permute): Likewise. (vect_slp_perms_eq): Likewise. (vect_optimize_slp): Remove special code to elide permutations with SLP reductions. Implement generic permute optimization. * gcc.dg/vect/bb-slp-50.c: New testcase. * gcc.dg/vect/bb-slp-51.c: Likewise.
This commit is contained in:
parent
7e7352b2ad
commit
126ed72b9f
20
gcc/testsuite/gcc.dg/vect/bb-slp-50.c
Normal file
20
gcc/testsuite/gcc.dg/vect/bb-slp-50.c
Normal file
@ -0,0 +1,20 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target vect_double } */
|
||||
|
||||
double a[2];
|
||||
double b[2];
|
||||
double c[2];
|
||||
double d[2];
|
||||
double e[2];
|
||||
void foo(double x)
|
||||
{
|
||||
double tembc0 = b[1] + c[1];
|
||||
double tembc1 = b[0] + c[0];
|
||||
double temde0 = d[0] + e[1];
|
||||
double temde1 = d[1] + e[0];
|
||||
a[0] = tembc0 + temde0;
|
||||
a[1] = tembc1 + temde1;
|
||||
}
|
||||
|
||||
/* We should common the permutation on the tembc{0,1} operands. */
|
||||
/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" } } */
|
20
gcc/testsuite/gcc.dg/vect/bb-slp-51.c
Normal file
20
gcc/testsuite/gcc.dg/vect/bb-slp-51.c
Normal file
@ -0,0 +1,20 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target vect_double } */
|
||||
|
||||
double a[2];
|
||||
double b[2];
|
||||
double c[2];
|
||||
double e[2];
|
||||
void foo(double x)
|
||||
{
|
||||
double tembc0 = b[1] + c[1];
|
||||
double tembc1 = b[0] + c[0];
|
||||
double temde0 = 5 + e[1];
|
||||
double temde1 = 11 + e[0];
|
||||
a[0] = tembc0 + temde0;
|
||||
a[1] = tembc1 + temde1;
|
||||
}
|
||||
|
||||
/* We should common the permutations on the tembc{0,1} and temde{0,1}
|
||||
operands. */
|
||||
/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\r\\n\]* VEC_PERM_EXPR" 1 "slp2" } } */
|
@ -841,7 +841,7 @@ vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
|
||||
|
||||
/* The stores of this instance are at the root of the SLP tree. */
|
||||
slp_tree store = SLP_INSTANCE_TREE (instance);
|
||||
if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
|
||||
if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store)))
|
||||
store = NULL;
|
||||
|
||||
/* Verify we can sink stores to the vectorized stmt insert location. */
|
||||
|
@ -1256,8 +1256,8 @@ vect_build_slp_tree_2 (vec_info *vinfo,
|
||||
if (gimple_assign_rhs_code (stmt) == COND_EXPR)
|
||||
nops++;
|
||||
}
|
||||
else if (is_a <gphi *> (stmt_info->stmt))
|
||||
nops = 0;
|
||||
else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
|
||||
nops = gimple_phi_num_args (phi);
|
||||
else
|
||||
return NULL;
|
||||
|
||||
@ -1294,7 +1294,7 @@ vect_build_slp_tree_2 (vec_info *vinfo,
|
||||
else
|
||||
return NULL;
|
||||
(*tree_size)++;
|
||||
node = vect_create_new_slp_node (stmts, 0);
|
||||
node = vect_create_new_slp_node (stmts, nops);
|
||||
SLP_TREE_VECTYPE (node) = vectype;
|
||||
return node;
|
||||
}
|
||||
@ -1812,188 +1812,6 @@ vect_mark_slp_stmts_relevant (slp_tree node)
|
||||
vect_mark_slp_stmts_relevant (node, visited);
|
||||
}
|
||||
|
||||
/* Copy the SLP subtree rooted at NODE. */
|
||||
|
||||
static slp_tree
|
||||
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
bool existed_p;
|
||||
slp_tree ©_ref = map.get_or_insert (node, &existed_p);
|
||||
if (existed_p)
|
||||
return copy_ref;
|
||||
|
||||
copy_ref = new _slp_tree;
|
||||
slp_tree copy = copy_ref;
|
||||
SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
|
||||
SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
|
||||
SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
|
||||
SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
|
||||
copy->max_nunits = node->max_nunits;
|
||||
SLP_TREE_REF_COUNT (copy) = 0;
|
||||
if (SLP_TREE_SCALAR_STMTS (node).exists ())
|
||||
SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
|
||||
if (SLP_TREE_SCALAR_OPS (node).exists ())
|
||||
SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
|
||||
if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
|
||||
SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
|
||||
if (SLP_TREE_LANE_PERMUTATION (node).exists ())
|
||||
SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
|
||||
if (SLP_TREE_CHILDREN (node).exists ())
|
||||
SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
|
||||
gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
|
||||
|
||||
slp_tree child;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
|
||||
{
|
||||
SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
|
||||
SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (copy)[i])++;
|
||||
}
|
||||
return copy;
|
||||
}
|
||||
|
||||
/* Rearrange the statements of NODE according to PERMUTATION. */
|
||||
|
||||
static void
|
||||
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
|
||||
vec<unsigned> permutation,
|
||||
hash_set<slp_tree> &visited)
|
||||
{
|
||||
unsigned int i;
|
||||
slp_tree child;
|
||||
|
||||
if (visited.add (node))
|
||||
return;
|
||||
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
|
||||
vect_slp_rearrange_stmts (child, group_size, permutation, visited);
|
||||
|
||||
if (SLP_TREE_SCALAR_STMTS (node).exists ())
|
||||
{
|
||||
gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
|
||||
vec<stmt_vec_info> tmp_stmts;
|
||||
tmp_stmts.create (group_size);
|
||||
tmp_stmts.quick_grow (group_size);
|
||||
stmt_vec_info stmt_info;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
|
||||
tmp_stmts[permutation[i]] = stmt_info;
|
||||
SLP_TREE_SCALAR_STMTS (node).release ();
|
||||
SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
|
||||
}
|
||||
if (SLP_TREE_SCALAR_OPS (node).exists ())
|
||||
{
|
||||
gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
|
||||
vec<tree> tmp_ops;
|
||||
tmp_ops.create (group_size);
|
||||
tmp_ops.quick_grow (group_size);
|
||||
tree op;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
|
||||
tmp_ops[permutation[i]] = op;
|
||||
SLP_TREE_SCALAR_OPS (node).release ();
|
||||
SLP_TREE_SCALAR_OPS (node) = tmp_ops;
|
||||
}
|
||||
if (SLP_TREE_LANE_PERMUTATION (node).exists ())
|
||||
{
|
||||
gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
|
||||
for (i = 0; i < group_size; ++i)
|
||||
SLP_TREE_LANE_PERMUTATION (node)[i].second
|
||||
= permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Attempt to reorder stmts in a reduction chain so that we don't
|
||||
require any load permutation. Return true if that was possible,
|
||||
otherwise return false. */
|
||||
|
||||
static bool
|
||||
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
|
||||
{
|
||||
unsigned int i, j;
|
||||
unsigned int lidx;
|
||||
slp_tree node, load;
|
||||
|
||||
if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
|
||||
return false;
|
||||
|
||||
/* Compare all the permutation sequences to the first one. We know
|
||||
that at least one load is permuted. */
|
||||
node = SLP_INSTANCE_LOADS (slp_instn)[0];
|
||||
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
|
||||
return false;
|
||||
unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
|
||||
for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
|
||||
{
|
||||
if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
|
||||
|| SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
|
||||
return false;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
|
||||
if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Check that the loads in the first sequence are different and there
|
||||
are no gaps between them and that there is an actual permutation. */
|
||||
bool any_permute = false;
|
||||
auto_sbitmap load_index (group_size);
|
||||
bitmap_clear (load_index);
|
||||
FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
|
||||
{
|
||||
if (lidx != i)
|
||||
any_permute = true;
|
||||
if (lidx >= group_size)
|
||||
return false;
|
||||
if (bitmap_bit_p (load_index, lidx))
|
||||
return false;
|
||||
|
||||
bitmap_set_bit (load_index, lidx);
|
||||
}
|
||||
if (!any_permute)
|
||||
return false;
|
||||
for (i = 0; i < group_size; i++)
|
||||
if (!bitmap_bit_p (load_index, i))
|
||||
return false;
|
||||
|
||||
/* This permutation is valid for reduction. Since the order of the
|
||||
statements in the nodes is not important unless they are memory
|
||||
accesses, we can rearrange the statements in all the nodes
|
||||
according to the order of the loads. */
|
||||
|
||||
/* We have to unshare the SLP tree we modify. */
|
||||
hash_map<slp_tree, slp_tree> map;
|
||||
slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
|
||||
vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn));
|
||||
SLP_TREE_REF_COUNT (unshared)++;
|
||||
SLP_INSTANCE_TREE (slp_instn) = unshared;
|
||||
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
|
||||
SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
|
||||
node = SLP_INSTANCE_LOADS (slp_instn)[0];
|
||||
|
||||
/* Do the actual re-arrangement. */
|
||||
hash_set<slp_tree> visited;
|
||||
vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
|
||||
node->load_permutation, visited);
|
||||
|
||||
/* We are done, no actual permutations need to be generated. */
|
||||
poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
|
||||
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
|
||||
{
|
||||
stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
|
||||
first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
|
||||
/* But we have to keep those permutations that are required because
|
||||
of handling of gaps. */
|
||||
if (known_eq (unrolling_factor, 1U)
|
||||
|| (group_size == DR_GROUP_SIZE (first_stmt_info)
|
||||
&& DR_GROUP_GAP (first_stmt_info) == 0))
|
||||
SLP_TREE_LOAD_PERMUTATION (node).release ();
|
||||
else
|
||||
for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
|
||||
SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Gather loads in the SLP graph NODE and populate the INST loads array. */
|
||||
|
||||
@ -2458,6 +2276,53 @@ vect_analyze_slp_instance (vec_info *vinfo,
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Fill in backedge SLP children in the SLP graph. */
|
||||
|
||||
static void
|
||||
vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node,
|
||||
scalar_stmts_to_slp_tree_map_t *bst_map,
|
||||
hash_set<slp_tree> &visited)
|
||||
{
|
||||
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
|
||||
|| visited.add (node))
|
||||
return;
|
||||
|
||||
slp_tree child;
|
||||
unsigned i;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
|
||||
if (child)
|
||||
vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
|
||||
|
||||
if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
|
||||
for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
|
||||
{
|
||||
auto_vec<stmt_vec_info, 64> stmts;
|
||||
unsigned j;
|
||||
stmt_vec_info phi_info;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info)
|
||||
{
|
||||
tree def = gimple_phi_arg_def (as_a <gphi *>(phi_info->stmt), i);
|
||||
stmt_vec_info def_info = vinfo->lookup_def (def);
|
||||
if (!def_info)
|
||||
break;
|
||||
stmts.safe_push (vect_stmt_to_vectorize (def_info));
|
||||
}
|
||||
if (j != SLP_TREE_LANES (node))
|
||||
continue;
|
||||
slp_tree *edge_def = bst_map->get (stmts);
|
||||
if (edge_def)
|
||||
{
|
||||
/* ??? We're currently not recording non-backedge children
|
||||
of PHIs like external reduction initial values. Avoid
|
||||
NULL entries in SLP_TREE_CHILDREN for those and thus
|
||||
for now simply only record backedge defs at a
|
||||
SLP_TREE_CHILDREN index possibly not matching that of
|
||||
the corresponding PHI argument index. */
|
||||
SLP_TREE_CHILDREN (node).quick_push (*edge_def);
|
||||
(*edge_def)->refcnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
|
||||
trees of packed scalar stmts if SLP is possible. */
|
||||
@ -2509,6 +2374,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
|
||||
max_tree_size);
|
||||
}
|
||||
|
||||
/* Fill in backedges. */
|
||||
slp_instance instance;
|
||||
hash_set<slp_tree> visited;
|
||||
FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
|
||||
vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance),
|
||||
bst_map, visited);
|
||||
|
||||
/* The map keeps a reference on SLP nodes built, release that. */
|
||||
for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
|
||||
it != bst_map->end (); ++it)
|
||||
@ -2519,34 +2391,396 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
|
||||
return opt_result::success ();
|
||||
}
|
||||
|
||||
/* Fill the vertices and leafs vector with all nodes in the SLP graph. */
|
||||
|
||||
static void
|
||||
vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
|
||||
vec<slp_tree> &vertices, vec<int> &leafs)
|
||||
{
|
||||
unsigned i;
|
||||
slp_tree child;
|
||||
|
||||
if (visited.add (node))
|
||||
return;
|
||||
|
||||
node->vertex = vertices.length ();
|
||||
vertices.safe_push (node);
|
||||
if (SLP_TREE_CHILDREN (node).is_empty ())
|
||||
leafs.safe_push (node->vertex);
|
||||
else
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
|
||||
vect_slp_build_vertices (visited, child, vertices, leafs);
|
||||
}
|
||||
|
||||
/* Fill the vertices and leafs vector with all nodes in the SLP graph. */
|
||||
|
||||
static void
|
||||
vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
|
||||
vec<int> &leafs)
|
||||
{
|
||||
hash_set<slp_tree> visited;
|
||||
unsigned i;
|
||||
slp_instance instance;
|
||||
FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
|
||||
vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
|
||||
leafs);
|
||||
}
|
||||
|
||||
/* Apply (reverse) bijectite PERM to VEC. */
|
||||
|
||||
template <class T>
|
||||
static void
|
||||
vect_slp_permute (vec<unsigned> perm,
|
||||
vec<T> &vec, bool reverse)
|
||||
{
|
||||
auto_vec<T, 64> saved;
|
||||
saved.create (vec.length ());
|
||||
for (unsigned i = 0; i < vec.length (); ++i)
|
||||
saved.quick_push (vec[i]);
|
||||
|
||||
if (reverse)
|
||||
{
|
||||
for (unsigned i = 0; i < vec.length (); ++i)
|
||||
vec[perm[i]] = saved[i];
|
||||
for (unsigned i = 0; i < vec.length (); ++i)
|
||||
gcc_assert (vec[perm[i]] == saved[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i = 0; i < vec.length (); ++i)
|
||||
vec[i] = saved[perm[i]];
|
||||
for (unsigned i = 0; i < vec.length (); ++i)
|
||||
gcc_assert (vec[i] == saved[perm[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Return whether permutations PERM_A and PERM_B as recorded in the
|
||||
PERMS vector are equal. */
|
||||
|
||||
static bool
|
||||
vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
|
||||
int perm_a, int perm_b)
|
||||
{
|
||||
return (perm_a == perm_b
|
||||
|| (perms[perm_a].length () == perms[perm_b].length ()
|
||||
&& memcmp (&perms[perm_a][0], &perms[perm_b][0],
|
||||
sizeof (unsigned) * perms[perm_a].length ()) == 0));
|
||||
}
|
||||
|
||||
/* Optimize the SLP graph of VINFO. */
|
||||
|
||||
void
|
||||
vect_optimize_slp (vec_info *vinfo)
|
||||
{
|
||||
/* Optimize permutations in SLP reductions. */
|
||||
slp_instance instance;
|
||||
unsigned i;
|
||||
FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
|
||||
{
|
||||
slp_tree node = SLP_INSTANCE_TREE (instance);
|
||||
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
|
||||
/* Reduction (there are no data-refs in the root).
|
||||
In reduction chain the order of the loads is not important. */
|
||||
if (!STMT_VINFO_DATA_REF (stmt_info)
|
||||
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
|
||||
&& !SLP_INSTANCE_ROOT_STMT (instance))
|
||||
vect_attempt_slp_rearrange_stmts (instance);
|
||||
}
|
||||
|
||||
/* Gather all loads in the SLP graph. */
|
||||
auto_vec<slp_tree> slp_loads;
|
||||
hash_set<slp_tree> visited;
|
||||
FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
|
||||
vect_gather_slp_loads (slp_loads, SLP_INSTANCE_TREE (instance),
|
||||
visited);
|
||||
if (vinfo->slp_instances.is_empty ())
|
||||
return;
|
||||
|
||||
slp_tree node;
|
||||
FOR_EACH_VEC_ELT (slp_loads, i, node)
|
||||
unsigned i;
|
||||
auto_vec<slp_tree> vertices;
|
||||
auto_vec<int> leafs;
|
||||
vect_slp_build_vertices (vinfo, vertices, leafs);
|
||||
|
||||
struct graph *slpg = new_graph (vertices.length ());
|
||||
FOR_EACH_VEC_ELT (vertices, i, node)
|
||||
{
|
||||
unsigned j;
|
||||
slp_tree child;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
|
||||
add_edge (slpg, i, child->vertex);
|
||||
}
|
||||
|
||||
/* Compute (reverse) postorder on the inverted graph. */
|
||||
auto_vec<int> ipo;
|
||||
graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
|
||||
|
||||
auto_sbitmap n_visited (vertices.length ());
|
||||
auto_sbitmap n_materialize (vertices.length ());
|
||||
auto_vec<int> n_perm (vertices.length ());
|
||||
auto_vec<vec<unsigned> > perms;
|
||||
|
||||
bitmap_clear (n_visited);
|
||||
bitmap_clear (n_materialize);
|
||||
n_perm.quick_grow_cleared (vertices.length ());
|
||||
perms.safe_push (vNULL); /* zero is no permute */
|
||||
|
||||
/* Produce initial permutations. */
|
||||
for (i = 0; i < leafs.length (); ++i)
|
||||
{
|
||||
int idx = leafs[i];
|
||||
slp_tree node = vertices[idx];
|
||||
|
||||
/* Handle externals and constants optimistically throughout the
|
||||
iteration. */
|
||||
if (SLP_TREE_DEF_TYPE (node) == vect_external_def
|
||||
|| SLP_TREE_DEF_TYPE (node) == vect_constant_def)
|
||||
continue;
|
||||
|
||||
/* Loads are the only thing generating permutes and leafs do not
|
||||
change across iterations. */
|
||||
bitmap_set_bit (n_visited, idx);
|
||||
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
|
||||
continue;
|
||||
|
||||
/* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
|
||||
node unpermuted, record this permute. */
|
||||
stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
|
||||
if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
|
||||
continue;
|
||||
dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
|
||||
unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
|
||||
bool any_permute = false;
|
||||
for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
|
||||
{
|
||||
unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
|
||||
imin = MIN (imin, idx);
|
||||
imax = MAX (imax, idx);
|
||||
if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
|
||||
any_permute = true;
|
||||
}
|
||||
/* If there's no permute no need to split one out. */
|
||||
if (!any_permute)
|
||||
continue;
|
||||
/* If the span doesn't match we'd disrupt VF computation, avoid
|
||||
that for now. */
|
||||
if (imax - imin + 1 != SLP_TREE_LANES (node))
|
||||
continue;
|
||||
|
||||
/* For now only handle true permutes, like
|
||||
vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
|
||||
when permuting constants and invariants keeping the permute
|
||||
bijective. */
|
||||
auto_sbitmap load_index (SLP_TREE_LANES (node));
|
||||
bitmap_clear (load_index);
|
||||
for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
|
||||
bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
|
||||
unsigned j;
|
||||
for (j = 0; j < SLP_TREE_LANES (node); ++j)
|
||||
if (!bitmap_bit_p (load_index, j))
|
||||
break;
|
||||
if (j != SLP_TREE_LANES (node))
|
||||
continue;
|
||||
|
||||
vec<unsigned> perm = vNULL;
|
||||
perm.safe_grow (SLP_TREE_LANES (node), true);
|
||||
for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
|
||||
perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
|
||||
perms.safe_push (perm);
|
||||
n_perm[idx] = perms.length () - 1;
|
||||
}
|
||||
|
||||
/* Propagate permutes along the graph and compute materialization points. */
|
||||
bool changed;
|
||||
unsigned iteration = 0;
|
||||
do
|
||||
{
|
||||
changed = false;
|
||||
++iteration;
|
||||
|
||||
for (i = vertices.length (); i > 0 ; --i)
|
||||
{
|
||||
int idx = ipo[i-1];
|
||||
slp_tree node = vertices[idx];
|
||||
/* For leafs there's nothing to do - we've seeded permutes
|
||||
on those above. */
|
||||
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
|
||||
continue;
|
||||
|
||||
bitmap_set_bit (n_visited, idx);
|
||||
|
||||
/* We cannot move a permute across a store. */
|
||||
if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
|
||||
&& DR_IS_WRITE
|
||||
(STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
|
||||
continue;
|
||||
|
||||
int perm = -1;
|
||||
for (graph_edge *succ = slpg->vertices[idx].succ;
|
||||
succ; succ = succ->succ_next)
|
||||
{
|
||||
int succ_idx = succ->dest;
|
||||
/* Handle unvisited nodes optimistically. */
|
||||
/* ??? But for constants once we want to handle non-bijective
|
||||
permutes we have to verify the permute, when unifying lanes,
|
||||
will not unify different constants. For example see
|
||||
gcc.dg/vect/bb-slp-14.c for a case that would break. */
|
||||
if (!bitmap_bit_p (n_visited, succ_idx))
|
||||
continue;
|
||||
int succ_perm = n_perm[succ_idx];
|
||||
/* Once we materialize succs permutation its output lanes
|
||||
appear unpermuted to us. */
|
||||
if (bitmap_bit_p (n_materialize, succ_idx))
|
||||
succ_perm = 0;
|
||||
if (perm == -1)
|
||||
perm = succ_perm;
|
||||
else if (succ_perm == 0)
|
||||
{
|
||||
perm = 0;
|
||||
break;
|
||||
}
|
||||
else if (!vect_slp_perms_eq (perms, perm, succ_perm))
|
||||
{
|
||||
perm = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (perm == -1)
|
||||
/* Pick up pre-computed leaf values. */
|
||||
perm = n_perm[idx];
|
||||
else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
|
||||
{
|
||||
if (iteration > 1)
|
||||
/* Make sure we eventually converge. */
|
||||
gcc_checking_assert (perm == 0);
|
||||
n_perm[idx] = perm;
|
||||
if (perm == 0)
|
||||
bitmap_clear_bit (n_materialize, idx);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
if (perm == 0)
|
||||
continue;
|
||||
|
||||
/* Elide pruning at materialization points in the first
|
||||
iteration so every node was visited once at least. */
|
||||
if (iteration == 1)
|
||||
continue;
|
||||
|
||||
/* Decide on permute materialization. Look whether there's
|
||||
a use (pred) edge that is permuted differently than us.
|
||||
In that case mark ourselves so the permutation is applied. */
|
||||
bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
|
||||
for (graph_edge *pred = slpg->vertices[idx].pred;
|
||||
pred; pred = pred->pred_next)
|
||||
{
|
||||
gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
|
||||
int pred_perm = n_perm[pred->src];
|
||||
if (!vect_slp_perms_eq (perms, perm, pred_perm))
|
||||
{
|
||||
all_preds_permuted = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!all_preds_permuted)
|
||||
{
|
||||
if (!bitmap_bit_p (n_materialize, idx))
|
||||
changed = true;
|
||||
bitmap_set_bit (n_materialize, idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
while (changed || iteration == 1);
|
||||
|
||||
/* Materialize. */
|
||||
for (i = 0; i < vertices.length (); ++i)
|
||||
{
|
||||
int perm = n_perm[i];
|
||||
if (perm <= 0)
|
||||
continue;
|
||||
|
||||
slp_tree node = vertices[i];
|
||||
|
||||
/* First permute invariant/external original successors. */
|
||||
unsigned j;
|
||||
slp_tree child;
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
|
||||
{
|
||||
if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
|
||||
continue;
|
||||
|
||||
/* If the vector is uniform there's nothing to do. */
|
||||
if (vect_slp_tree_uniform_p (child))
|
||||
continue;
|
||||
|
||||
/* We can end up sharing some externals via two_operator
|
||||
handling. Be prepared to unshare those. */
|
||||
if (child->refcnt != 1)
|
||||
{
|
||||
gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
|
||||
SLP_TREE_CHILDREN (node)[j] = child
|
||||
= vect_create_new_slp_node
|
||||
(SLP_TREE_SCALAR_OPS (child).copy ());
|
||||
}
|
||||
vect_slp_permute (perms[perm],
|
||||
SLP_TREE_SCALAR_OPS (child), true);
|
||||
}
|
||||
|
||||
if (bitmap_bit_p (n_materialize, i))
|
||||
{
|
||||
if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
|
||||
/* For loads simply drop the permutation, the load permutation
|
||||
already performs the desired permutation. */
|
||||
;
|
||||
else
|
||||
{
|
||||
if (dump_enabled_p ())
|
||||
dump_printf_loc (MSG_NOTE, vect_location,
|
||||
"inserting permute node in place of %p\n",
|
||||
node);
|
||||
|
||||
/* Make a copy of NODE and in-place change it to a
|
||||
VEC_PERM node to permute the lanes of the copy. */
|
||||
slp_tree copy = new _slp_tree;
|
||||
SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
|
||||
SLP_TREE_CHILDREN (node) = vNULL;
|
||||
SLP_TREE_SCALAR_STMTS (copy)
|
||||
= SLP_TREE_SCALAR_STMTS (node).copy ();
|
||||
vect_slp_permute (perms[perm],
|
||||
SLP_TREE_SCALAR_STMTS (copy), true);
|
||||
gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
|
||||
SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
|
||||
gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
|
||||
SLP_TREE_LANE_PERMUTATION (copy)
|
||||
= SLP_TREE_LANE_PERMUTATION (node);
|
||||
SLP_TREE_LANE_PERMUTATION (node) = vNULL;
|
||||
SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
|
||||
copy->refcnt = 1;
|
||||
copy->max_nunits = node->max_nunits;
|
||||
SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
|
||||
SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
|
||||
SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
|
||||
|
||||
/* Now turn NODE into a VEC_PERM. */
|
||||
SLP_TREE_CHILDREN (node).safe_push (copy);
|
||||
SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
|
||||
for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
|
||||
SLP_TREE_LANE_PERMUTATION (node)
|
||||
.quick_push (std::make_pair (0, perms[perm][j]));
|
||||
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Apply the reverse permutation to our stmts. */
|
||||
vect_slp_permute (perms[perm],
|
||||
SLP_TREE_SCALAR_STMTS (node), true);
|
||||
/* And to the load permutation, which we can simply
|
||||
make regular by design. */
|
||||
if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
|
||||
{
|
||||
/* ??? When we handle non-bijective permutes the idea
|
||||
is that we can force the load-permutation to be
|
||||
{ min, min + 1, min + 2, ... max }. But then the
|
||||
scalar defs might no longer match the lane content
|
||||
which means wrong-code with live lane vectorization.
|
||||
So we possibly have to have NULL entries for those. */
|
||||
vect_slp_permute (perms[perm],
|
||||
SLP_TREE_LOAD_PERMUTATION (node), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Free the perms vector used for propagation. */
|
||||
while (!perms.is_empty ())
|
||||
perms.pop ().release ();
|
||||
free_graph (slpg);
|
||||
|
||||
|
||||
/* Now elide load permutations that are not necessary. */
|
||||
for (i = 0; i < leafs.length (); ++i)
|
||||
{
|
||||
node = vertices[i];
|
||||
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
|
||||
continue;
|
||||
|
||||
@ -2593,7 +2827,8 @@ vect_optimize_slp (vec_info *vinfo)
|
||||
/* The load requires permutation when unrolling exposes
|
||||
a gap either because the group is larger than the SLP
|
||||
group-size or because there is a gap between the groups. */
|
||||
&& (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
|
||||
&& (known_eq (LOOP_VINFO_VECT_FACTOR
|
||||
(as_a <loop_vec_info> (vinfo)), 1U)
|
||||
|| ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
|
||||
&& DR_GROUP_GAP (first_stmt_info) == 0)))
|
||||
{
|
||||
@ -2975,12 +3210,9 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
|
||||
return true;
|
||||
|
||||
/* If we already analyzed the exact same set of scalar stmts we're done.
|
||||
We share the generated vector stmts for those.
|
||||
The SLP graph is acyclic so not caching whether we failed or succeeded
|
||||
doesn't result in any issue since we throw away the lvisited set
|
||||
when we fail. */
|
||||
We share the generated vector stmts for those. */
|
||||
if (visited.contains (node)
|
||||
|| lvisited.contains (node))
|
||||
|| lvisited.add (node))
|
||||
return true;
|
||||
|
||||
bool res = true;
|
||||
@ -2993,12 +3225,10 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
|
||||
}
|
||||
|
||||
if (res)
|
||||
{
|
||||
res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
|
||||
cost_vec);
|
||||
if (res)
|
||||
lvisited.add (node);
|
||||
}
|
||||
res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
|
||||
cost_vec);
|
||||
if (!res)
|
||||
lvisited.remove (node);
|
||||
|
||||
/* When the node can be vectorized cost invariant nodes it references.
|
||||
This is not done in DFS order to allow the refering node
|
||||
@ -4685,7 +4915,8 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
|
||||
|
||||
static void
|
||||
vect_schedule_slp_instance (vec_info *vinfo,
|
||||
slp_tree node, slp_instance instance)
|
||||
slp_tree node, slp_instance instance,
|
||||
hash_set<slp_tree> &visited)
|
||||
{
|
||||
gimple_stmt_iterator si;
|
||||
int i;
|
||||
@ -4712,8 +4943,12 @@ vect_schedule_slp_instance (vec_info *vinfo,
|
||||
return;
|
||||
}
|
||||
|
||||
/* ??? If we'd have a way to mark backedges that would be cheaper. */
|
||||
if (visited.add (node))
|
||||
return;
|
||||
|
||||
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
|
||||
vect_schedule_slp_instance (vinfo, child, instance);
|
||||
vect_schedule_slp_instance (vinfo, child, instance, visited);
|
||||
|
||||
gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
|
||||
SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
|
||||
@ -4737,14 +4972,13 @@ vect_schedule_slp_instance (vec_info *vinfo,
|
||||
last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
|
||||
si = gsi_for_stmt (last_stmt_info->stmt);
|
||||
}
|
||||
else if (SLP_TREE_CHILDREN (node).is_empty ())
|
||||
else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
|
||||
== cycle_phi_info_type)
|
||||
|| (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
|
||||
== induc_vec_info_type))
|
||||
{
|
||||
/* This happens for reduction and induction PHIs where we do not use the
|
||||
/* For reduction and induction PHIs we do not use the
|
||||
insertion iterator. */
|
||||
gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
|
||||
== cycle_phi_info_type
|
||||
|| (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
|
||||
== induc_vec_info_type));
|
||||
si = gsi_none ();
|
||||
}
|
||||
else
|
||||
@ -4957,6 +5191,7 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
|
||||
slp_instance instance;
|
||||
unsigned int i;
|
||||
|
||||
hash_set<slp_tree> visited;
|
||||
FOR_EACH_VEC_ELT (slp_instances, i, instance)
|
||||
{
|
||||
slp_tree node = SLP_INSTANCE_TREE (instance);
|
||||
@ -4971,7 +5206,7 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
|
||||
SLP_INSTANCE_TREE (instance));
|
||||
}
|
||||
/* Schedule the tree of INSTANCE. */
|
||||
vect_schedule_slp_instance (vinfo, node, instance);
|
||||
vect_schedule_slp_instance (vinfo, node, instance, visited);
|
||||
|
||||
if (SLP_INSTANCE_ROOT_STMT (instance))
|
||||
vectorize_slp_instance_root_stmt (node, instance);
|
||||
|
@ -161,6 +161,8 @@ struct _slp_tree {
|
||||
unsigned int lanes;
|
||||
/* The operation of this node. */
|
||||
enum tree_code code;
|
||||
|
||||
int vertex;
|
||||
};
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user