mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-03-24 04:10:29 +08:00
pass: Run cleanup passes before SLP [PR96789]
As the discussion in PR96789, we found that some scalar stmts which can be eliminated by some passes after SLP, but we still modeled their costs when trying to SLP, it could impact vectorizer's decision. One typical case is the case in PR96789 on target Power. As Richard suggested there, this patch is to introduce one pass called pre_slp_scalar_cleanup which has some secondary clean up passes, for now they are FRE and DSE. It introduces one new TODO flags group called pending TODO flags, unlike normal TODO flags, the pending TODO flags are passed down in the pipeline until one of its consumers can perform the requested action. Consumers should then clear the flags for the actions that they have taken. Soem compilation time statistics on all SPEC2017 INT bmks were collected on one Power9 machine for several option sets below: A1: -Ofast -funroll-loops A2: -O1 A3: -O1 -funroll-loops A4: -O2 A5: -O2 -funroll-loops the corresponding increment rate is trivial: A1 A2 A3 A4 A5 0.08% 0.00% -0.38% -0.10% -0.05% Bootstrapped/regtested on powerpc64le-linux-gnu P8. gcc/ChangeLog: PR tree-optimization/96789 * function.h (struct function): New member unsigned pending_TODOs. * passes.c (class pass_pre_slp_scalar_cleanup): New class. (make_pass_pre_slp_scalar_cleanup): New function. (pass_data_pre_slp_scalar_cleanup): New pass data. * passes.def: (pass_pre_slp_scalar_cleanup): New pass, add pass_fre and pass_dse as its children. * timevar.def (TV_SCALAR_CLEANUP): New timevar. * tree-pass.h (PENDING_TODO_force_next_scalar_cleanup): New pending TODO flag. (make_pass_pre_slp_scalar_cleanup): New declare. * tree-ssa-loop-ivcanon.c (tree_unroll_loops_completely_1): Once any outermost loop gets unrolled, flag cfun pending_TODOs PENDING_TODO_force_next_scalar_cleanup on. gcc/testsuite/ChangeLog: PR tree-optimization/96789 * gcc.dg/tree-ssa/ssa-dse-28.c: Adjust. * gcc.dg/tree-ssa/ssa-dse-29.c: Likewise. * gcc.dg/vect/bb-slp-41.c: Likewise. * gcc.dg/tree-ssa/pr96789.c: New test.
This commit is contained in:
parent
bd6ecbe48a
commit
f5e18dd9c7
@ -269,6 +269,13 @@ struct GTY(()) function {
|
||||
/* Value histograms attached to particular statements. */
|
||||
htab_t GTY((skip)) value_histograms;
|
||||
|
||||
/* Different from normal TODO_flags which are handled right at the
|
||||
beginning or the end of one pass execution, the pending_TODOs
|
||||
are passed down in the pipeline until one of its consumers can
|
||||
perform the requested action. Consumers should then clear the
|
||||
flags for the actions that they have taken. */
|
||||
unsigned int pending_TODOs;
|
||||
|
||||
/* For function.c. */
|
||||
|
||||
/* Points to the FUNCTION_DECL of this function. */
|
||||
|
47
gcc/passes.c
47
gcc/passes.c
@ -731,7 +731,54 @@ make_pass_late_compilation (gcc::context *ctxt)
|
||||
return new pass_late_compilation (ctxt);
|
||||
}
|
||||
|
||||
/* Pre-SLP scalar cleanup, it has several cleanup passes like FRE, DSE. */
|
||||
|
||||
namespace {
|
||||
|
||||
const pass_data pass_data_pre_slp_scalar_cleanup =
|
||||
{
|
||||
GIMPLE_PASS, /* type */
|
||||
"*pre_slp_scalar_cleanup", /* name */
|
||||
OPTGROUP_LOOP, /* optinfo_flags */
|
||||
TV_SCALAR_CLEANUP, /* tv_id */
|
||||
( PROP_cfg | PROP_ssa ), /* properties_required */
|
||||
0, /* properties_provided */
|
||||
0, /* properties_destroyed */
|
||||
0, /* todo_flags_start */
|
||||
0, /* todo_flags_finish */
|
||||
};
|
||||
|
||||
class pass_pre_slp_scalar_cleanup : public gimple_opt_pass
|
||||
{
|
||||
public:
|
||||
pass_pre_slp_scalar_cleanup (gcc::context *ctxt)
|
||||
: gimple_opt_pass (pass_data_pre_slp_scalar_cleanup, ctxt)
|
||||
{
|
||||
}
|
||||
|
||||
virtual bool
|
||||
gate (function *fun)
|
||||
{
|
||||
return flag_tree_slp_vectorize
|
||||
&& (fun->pending_TODOs & PENDING_TODO_force_next_scalar_cleanup);
|
||||
}
|
||||
|
||||
virtual unsigned int
|
||||
execute (function *fun)
|
||||
{
|
||||
fun->pending_TODOs &= ~PENDING_TODO_force_next_scalar_cleanup;
|
||||
return 0;
|
||||
}
|
||||
|
||||
}; // class pass_pre_slp_scalar_cleanup
|
||||
|
||||
} // anon namespace
|
||||
|
||||
gimple_opt_pass *
|
||||
make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt)
|
||||
{
|
||||
return new pass_pre_slp_scalar_cleanup (ctxt);
|
||||
}
|
||||
|
||||
/* Set the static pass number of pass PASS to ID and record that
|
||||
in the mapping from static pass number to pass. */
|
||||
|
@ -289,11 +289,16 @@ along with GCC; see the file COPYING3. If not see
|
||||
/* pass_vectorize must immediately follow pass_if_conversion.
|
||||
Please do not add any other passes in between. */
|
||||
NEXT_PASS (pass_vectorize);
|
||||
PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
|
||||
PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
|
||||
NEXT_PASS (pass_dce);
|
||||
POP_INSERT_PASSES ()
|
||||
NEXT_PASS (pass_predcom);
|
||||
POP_INSERT_PASSES ()
|
||||
NEXT_PASS (pass_predcom);
|
||||
NEXT_PASS (pass_complete_unroll);
|
||||
NEXT_PASS (pass_pre_slp_scalar_cleanup);
|
||||
PUSH_INSERT_PASSES_WITHIN (pass_pre_slp_scalar_cleanup)
|
||||
NEXT_PASS (pass_fre, false /* may_iterate */);
|
||||
NEXT_PASS (pass_dse);
|
||||
POP_INSERT_PASSES ()
|
||||
NEXT_PASS (pass_slp_vectorize);
|
||||
NEXT_PASS (pass_loop_prefetch);
|
||||
/* Run IVOPTs after the last pass that uses data-reference analysis
|
||||
|
58
gcc/testsuite/gcc.dg/tree-ssa/pr96789.c
Normal file
58
gcc/testsuite/gcc.dg/tree-ssa/pr96789.c
Normal file
@ -0,0 +1,58 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -funroll-loops -ftree-vectorize -fdump-tree-dse-details" } */
|
||||
|
||||
/* Test if scalar cleanup pass takes effects, mainly check
|
||||
its secondary pass DSE can remove dead stores on array
|
||||
tmp. */
|
||||
|
||||
#include "stdint.h"
|
||||
|
||||
static inline void
|
||||
foo (int16_t *diff, int i_size, uint8_t *val1, int i_val1, uint8_t *val2,
|
||||
int i_val2)
|
||||
{
|
||||
for (int y = 0; y < i_size; y++)
|
||||
{
|
||||
for (int x = 0; x < i_size; x++)
|
||||
diff[x + y * i_size] = val1[x] - val2[x];
|
||||
val1 += i_val1;
|
||||
val2 += i_val2;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
bar (int16_t res[16], uint8_t *val1, uint8_t *val2)
|
||||
{
|
||||
int16_t d[16];
|
||||
int16_t tmp[16];
|
||||
|
||||
foo (d, 4, val1, 16, val2, 32);
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
int s03 = d[i * 4 + 0] + d[i * 4 + 3];
|
||||
int s12 = d[i * 4 + 1] + d[i * 4 + 2];
|
||||
int d03 = d[i * 4 + 0] - d[i * 4 + 3];
|
||||
int d12 = d[i * 4 + 1] - d[i * 4 + 2];
|
||||
|
||||
tmp[0 * 4 + i] = s03 + s12;
|
||||
tmp[1 * 4 + i] = 2 * d03 + d12;
|
||||
tmp[2 * 4 + i] = s03 - s12;
|
||||
tmp[3 * 4 + i] = d03 - 2 * d12;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
int s03 = tmp[i * 4 + 0] + tmp[i * 4 + 3];
|
||||
int s12 = tmp[i * 4 + 1] + tmp[i * 4 + 2];
|
||||
int d03 = tmp[i * 4 + 0] - tmp[i * 4 + 3];
|
||||
int d12 = tmp[i * 4 + 1] - tmp[i * 4 + 2];
|
||||
|
||||
res[i * 4 + 0] = s03 + s12;
|
||||
res[i * 4 + 1] = 2 * d03 + d12;
|
||||
res[i * 4 + 2] = s03 - s12;
|
||||
res[i * 4 + 3] = d03 - 2 * d12;
|
||||
}
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump {Deleted dead store:.*tmp} "dse3" } } */
|
@ -17,5 +17,5 @@ int foo (int *p, int b)
|
||||
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse1"} } */
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */
|
||||
|
||||
|
@ -22,5 +22,5 @@ foo(int cond, struct z *s)
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Deleted dead store" 3 "dse1"} } */
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */
|
||||
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */
|
||||
|
||||
|
@ -10,7 +10,10 @@ foo (int *a, int *b)
|
||||
a[i] = b[0] + b[1] + b[i+1] + b[i+2];
|
||||
}
|
||||
|
||||
void bar (int *a, int *b)
|
||||
/* Disable pre-slp FRE to avoid unexpected SLP on the epilogue
|
||||
of the 1st loop. */
|
||||
void __attribute__((optimize("-fno-tree-fre")))
|
||||
bar (int *a, int *b)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < (ARR_SIZE - 2); ++i)
|
||||
|
@ -194,6 +194,7 @@ DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
|
||||
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
|
||||
DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
|
||||
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
|
||||
DEFTIMEVAR (TV_SCALAR_CLEANUP , "scalar cleanup")
|
||||
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
|
||||
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
|
||||
DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
|
||||
|
@ -310,6 +310,11 @@ protected:
|
||||
|
||||
#define TODO_verify_all TODO_verify_il
|
||||
|
||||
/* To-do flags for pending_TODOs. */
|
||||
|
||||
/* Tell the next scalar cleanup pass that there is
|
||||
work for it to do. */
|
||||
#define PENDING_TODO_force_next_scalar_cleanup (1 << 1)
|
||||
|
||||
/* Register pass info. */
|
||||
|
||||
@ -380,6 +385,7 @@ extern gimple_opt_pass *make_pass_simduid_cleanup (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
|
||||
|
@ -1406,6 +1406,9 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
|
||||
bitmap_clear (father_bbs);
|
||||
bitmap_set_bit (father_bbs, loop_father->header->index);
|
||||
}
|
||||
else if (unroll_outer)
|
||||
/* Trigger scalar cleanup once any outermost loop gets unrolled. */
|
||||
cfun->pending_TODOs |= PENDING_TODO_force_next_scalar_cleanup;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user