re PR middle-end/82853 (Optimize x % 3 == 0 without modulo)

	PR middle-end/82853
	* expr.h (maybe_optimize_mod_cmp): Declare.
	* expr.c (mod_inv): New function.
	(maybe_optimize_mod_cmp): New function.
	(do_store_flag): Use it.
	* cfgexpand.c (expand_gimple_cond): Likewise.

	* gcc.target/i386/pr82853-1.c: New test.
	* gcc.target/i386/pr82853-2.c: New test.

From-SVN: r264248
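
At its core, the patch trades the division implied by a remainder test for a multiplication by the modular inverse of the divisor plus one unsigned comparison. A hand-worked illustration of the unsigned odd-divisor case at 32-bit precision (editor's example, using the constants documented in expr.c below; not compiler output):

/* For odd C1, x % C1 == 0 is equivalent to x * C3 <= C4, where C3 is the
   multiplicative inverse of C1 modulo 1<<32 and C4 is 0xffffffffU / C1.  */
unsigned before (unsigned x) { return x % 3U == 0; }	/* needs a divide or a magic-number sequence */
unsigned after (unsigned x) { return x * 0xaaaaaaabU <= 0x55555555U; }	/* one mul, one cmp */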

gcc/ChangeLog

@@ -1,3 +1,12 @@
2018-09-12  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/82853
	* expr.h (maybe_optimize_mod_cmp): Declare.
	* expr.c (mod_inv): New function.
	(maybe_optimize_mod_cmp): New function.
	(do_store_flag): Use it.
	* cfgexpand.c (expand_gimple_cond): Likewise.

2018-09-09  Cesar Philippidis  <cesar@codesourcery.com>
	    Julian Brown  <julian@codesourcery.com>

gcc/cfgexpand.c

@@ -2477,6 +2477,13 @@ expand_gimple_cond (basic_block bb, gcond *stmt)
	}
    }

  /* Optimize (x % C1) == C2 or (x % C1) != C2 into
     (x - C2) * C3 < C4 if it is beneficial.  */
  if ((code == EQ_EXPR || code == NE_EXPR)
      && TREE_CODE (op0) == SSA_NAME
      && TREE_CODE (op1) == INTEGER_CST)
    code = maybe_optimize_mod_cmp (code, &op0, &op1);

  last2 = last = get_last_insn ();

  extract_true_false_edges_from_block (bb, &true_edge, &false_edge);

gcc/expr.c

@@ -11491,6 +11491,243 @@ string_constant (tree arg, tree *ptr_offset, tree *mem_size, tree *nonstr)
  return init;
}

/* Compute the modular multiplicative inverse of A modulo B
   using the extended Euclidean algorithm.  Assumes A and B are coprime.  */

static wide_int
mod_inv (const wide_int &a, const wide_int &b)
{
  /* Verify the assumption.  */
  gcc_checking_assert (wi::eq_p (wi::gcd (a, b), 1));

  unsigned int p = a.get_precision () + 1;
  gcc_checking_assert (b.get_precision () + 1 == p);
  wide_int c = wide_int::from (a, p, UNSIGNED);
  wide_int d = wide_int::from (b, p, UNSIGNED);
  wide_int x0 = wide_int::from (0, p, UNSIGNED);
  wide_int x1 = wide_int::from (1, p, UNSIGNED);

  if (wi::eq_p (b, 1))
    return wide_int::from (1, p, UNSIGNED);

  while (wi::gt_p (c, 1, UNSIGNED))
    {
      wide_int t = d;
      wide_int q = wi::divmod_trunc (c, d, UNSIGNED, &d);
      c = t;
      wide_int s = x0;
      x0 = wi::sub (x1, wi::mul (q, x0));
      x1 = s;
    }

  if (wi::lt_p (x1, 0, SIGNED))
    x1 += d;

  return x1;
}
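
For experimentation outside the wide_int machinery, the same computation can be sketched with the textbook extended Euclidean algorithm over plain 64-bit integers; a minimal standalone version for prec == 32 (editor's sketch, the helper name mod_inv32 is hypothetical and not part of the patch):

/* Editor's sketch, not part of the patch: compute A^-1 mod 2^32 with the
   extended Euclidean algorithm, mirroring mod_inv above with B = 1<<32.  */
#include <stdint.h>
#include <stdio.h>

static uint32_t
mod_inv32 (uint32_t a)
{
  /* Assumes A is odd, so A and 2^32 are coprime; all intermediate
     values fit in int64_t.  */
  int64_t m = (int64_t) 1 << 32;
  int64_t t = 0, newt = 1;
  int64_t r = m, newr = a;
  while (newr != 0)
    {
      int64_t q = r / newr;
      int64_t tmp = t - q * newt;
      t = newt;
      newt = tmp;
      tmp = r - q * newr;
      r = newr;
      newr = tmp;
    }
  if (t < 0)
    t += m;
  return (uint32_t) t;
}

int
main (void)
{
  uint32_t inv = mod_inv32 (3);
  /* Prints 0xaaaaaaab 1: the product 3 * 0xaaaaaaab wraps to 1 mod 2^32.  */
  printf ("%#lx %lu\n", (unsigned long) inv,
	  (unsigned long) (uint32_t) (inv * 3u));
  return 0;
}
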
/* Attempt to optimize unsigned (X % C1) == C2 (or (X % C1) != C2).

   If C1 is odd, transform it to:
   (X - C2) * C3 <= C4 (or >), where
   C3 is the modular multiplicative inverse of C1 modulo 1<<prec and
   C4 is ((1<<prec) - 1) / C1 or ((1<<prec) - 1) / C1 - 1 (the latter
   if C2 > ((1<<prec) - 1) % C1).
   If C1 is even, S = ctz (C1) and C2 is 0, use
   ((X * C3) r>> S) <= C4, where C3 is the modular multiplicative
   inverse of C1>>S modulo 1<<prec and C4 is (((1<<prec) - 1) / (C1>>S)) >> S.

   For signed (X % C1) == 0, if C1 is odd, transform it to (all operations
   in it unsigned):
   (X * C3) + C4 <= 2 * C4, where
   C3 is the modular multiplicative inverse of (unsigned) C1 modulo 1<<prec
   and C4 is ((1 << (prec - 1)) - 1) / C1.
   If C1 is even, S = ctz (C1), use
   ((X * C3) + C4) r>> S <= (C4 >> (S - 1)),
   where C3 is the modular multiplicative inverse of (unsigned) (C1>>S)
   modulo 1<<prec and C4 is (((1 << (prec - 1)) - 1) / (C1>>S)) & (-1<<S).

   See the Hacker's Delight book, section 10-17.  A worked example with
   concrete constants follows this function.  */
enum tree_code
maybe_optimize_mod_cmp (enum tree_code code, tree *arg0, tree *arg1)
{
  gcc_checking_assert (code == EQ_EXPR || code == NE_EXPR);
  gcc_checking_assert (TREE_CODE (*arg1) == INTEGER_CST);

  if (optimize < 2)
    return code;

  gimple *stmt = get_def_for_expr (*arg0, TRUNC_MOD_EXPR);
  if (stmt == NULL)
    return code;

  tree treeop0 = gimple_assign_rhs1 (stmt);
  tree treeop1 = gimple_assign_rhs2 (stmt);
  if (TREE_CODE (treeop0) != SSA_NAME
      || TREE_CODE (treeop1) != INTEGER_CST
      /* x % pow2 is handled right already.  */
      || integer_pow2p (treeop1)
      /* Don't optimize the undefined behavior case x % 0;
	 x % 1 should have been optimized into zero, punt if
	 it makes it here for whatever reason;
	 x % -c should have been optimized into x % c.  */
      || compare_tree_int (treeop1, 2) <= 0
      /* Likewise x % c == d where d >= c should be always false.  */
      || tree_int_cst_le (treeop1, *arg1))
    return code;

  tree type = TREE_TYPE (*arg0);
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (TYPE_MODE (type), &mode))
    return code;
  if (GET_MODE_BITSIZE (mode) != TYPE_PRECISION (type)
      || TYPE_PRECISION (type) <= 1)
    return code;

  signop sgn = UNSIGNED;
  /* If both operands are known to have the sign bit clear, handle
     even the signed modulo case as unsigned.  treeop1 is always
     positive >= 2, checked above.  */
  if (!TYPE_UNSIGNED (type) && get_range_pos_neg (treeop0) != 1)
    sgn = SIGNED;

  if (!TYPE_UNSIGNED (type))
    {
      if (tree_int_cst_sgn (*arg1) == -1)
	return code;
      type = unsigned_type_for (type);
      if (!type || TYPE_MODE (type) != TYPE_MODE (TREE_TYPE (*arg0)))
	return code;
    }

  int prec = TYPE_PRECISION (type);
  wide_int w = wi::to_wide (treeop1);
  int shift = wi::ctz (w);
  /* Unsigned (X % C1) == C2 is equivalent to (X - C2) % C1 == 0 if
     C2 <= -1U % C1, because for any Z >= 0U - C2 in that case (Z % C1) != 0.
     If C1 is odd, we can handle all cases by subtracting
     C4 below.  We could handle even the even C1 and C2 > -1U % C1 cases
     e.g. by testing for overflow on the subtraction, punt on that for now
     though.  */
  if ((sgn == SIGNED || shift) && !integer_zerop (*arg1))
    {
      if (sgn == SIGNED)
	return code;
      wide_int x = wi::umod_trunc (wi::mask (prec, false, prec), w);
      if (wi::gtu_p (wi::to_wide (*arg1), x))
	return code;
    }

  imm_use_iterator imm_iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, treeop0)
    {
      gimple *use_stmt = USE_STMT (use_p);
      /* Punt if treeop0 is used in the same bb in a division
	 or another modulo with the same divisor.  We should expect
	 the division and modulo combined together.  */
      if (use_stmt == stmt
	  || gimple_bb (use_stmt) != gimple_bb (stmt))
	continue;
      if (!is_gimple_assign (use_stmt)
	  || (gimple_assign_rhs_code (use_stmt) != TRUNC_DIV_EXPR
	      && gimple_assign_rhs_code (use_stmt) != TRUNC_MOD_EXPR))
	continue;
      if (gimple_assign_rhs1 (use_stmt) != treeop0
	  || !operand_equal_p (gimple_assign_rhs2 (use_stmt), treeop1, 0))
	continue;
      return code;
    }

  w = wi::lrshift (w, shift);
  wide_int a = wide_int::from (w, prec + 1, UNSIGNED);
  wide_int b = wi::shifted_mask (prec, 1, false, prec + 1);
  wide_int m = wide_int::from (mod_inv (a, b), prec, UNSIGNED);
  tree c3 = wide_int_to_tree (type, m);
  tree c5 = NULL_TREE;
  wide_int d, e;
  if (sgn == UNSIGNED)
    {
      d = wi::divmod_trunc (wi::mask (prec, false, prec), w, UNSIGNED, &e);
      /* Use <= floor ((1<<prec) - 1) / C1 only if C2 <= ((1<<prec) - 1) % C1,
	 otherwise use < or subtract one from C4.  E.g. for
	 x % 3U == 0 we transform this into x * 0xaaaaaaab <= 0x55555555, but
	 x % 3U == 1 already needs to be
	 (x - 1) * 0xaaaaaaabU <= 0x55555554.  */
      if (!shift && wi::gtu_p (wi::to_wide (*arg1), e))
	d -= 1;
      if (shift)
	d = wi::lrshift (d, shift);
    }
  else
    {
      e = wi::udiv_trunc (wi::mask (prec - 1, false, prec), w);
      if (!shift)
	d = wi::lshift (e, 1);
      else
	{
	  e = wi::bit_and (e, wi::mask (shift, true, prec));
	  d = wi::lrshift (e, shift - 1);
	}
      c5 = wide_int_to_tree (type, e);
    }
  tree c4 = wide_int_to_tree (type, d);

  rtx op0 = expand_normal (treeop0);
  treeop0 = make_tree (TREE_TYPE (treeop0), op0);

  bool speed_p = optimize_insn_for_speed_p ();

  do_pending_stack_adjust ();

  location_t loc = gimple_location (stmt);
  struct separate_ops ops;
  ops.code = TRUNC_MOD_EXPR;
  ops.location = loc;
  ops.type = TREE_TYPE (treeop0);
  ops.op0 = treeop0;
  ops.op1 = treeop1;
  ops.op2 = NULL_TREE;
  start_sequence ();
  rtx mor = expand_expr_real_2 (&ops, NULL_RTX, TYPE_MODE (ops.type),
				EXPAND_NORMAL);
  rtx_insn *moinsns = get_insns ();
  end_sequence ();

  unsigned mocost = seq_cost (moinsns, speed_p);
  mocost += rtx_cost (mor, mode, EQ, 0, speed_p);
  mocost += rtx_cost (expand_normal (*arg1), mode, EQ, 1, speed_p);

  tree t = fold_convert_loc (loc, type, treeop0);
  if (!integer_zerop (*arg1))
    t = fold_build2_loc (loc, MINUS_EXPR, type, t, fold_convert (type, *arg1));
  t = fold_build2_loc (loc, MULT_EXPR, type, t, c3);
  if (sgn == SIGNED)
    t = fold_build2_loc (loc, PLUS_EXPR, type, t, c5);
  if (shift)
    {
      tree s = build_int_cst (NULL_TREE, shift);
      t = fold_build2_loc (loc, RROTATE_EXPR, type, t, s);
    }

  start_sequence ();
  rtx mur = expand_normal (t);
  rtx_insn *muinsns = get_insns ();
  end_sequence ();

  unsigned mucost = seq_cost (muinsns, speed_p);
  mucost += rtx_cost (mur, mode, LE, 0, speed_p);
  mucost += rtx_cost (expand_normal (c4), mode, LE, 1, speed_p);

  if (mocost <= mucost)
    {
      emit_insn (moinsns);
      *arg0 = make_tree (TREE_TYPE (*arg0), mor);
      return code;
    }

  emit_insn (muinsns);
  *arg0 = make_tree (type, mur);
  *arg1 = c4;
  return code == EQ_EXPR ? LE_EXPR : GT_EXPR;
}
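
This is the worked example referenced in the comment above: a brute-force check of the two odd-divisor mappings for C1 == 3 at prec == 32 (editor's sketch; the constants are computed from the documented formulas, not extracted from generated code):

/* Editor's sketch, not part of the patch.  */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* Unsigned: x % 3U == 1 becomes (x - 1) * 0xaaaaaaabU <= 0x55555554U;
     C4 is decremented because C2 (1) exceeds 0xffffffffU % 3U, which is 0.  */
  for (uint32_t x = 0; x < 1000000; x++)
    assert ((x % 3u == 1) == ((x - 1u) * 0xaaaaaaabu <= 0x55555554u));

  /* Signed: x % 3 == 0 becomes x * C3 + C4 <= 2 * C4 in unsigned
     arithmetic, with C3 == 0xaaaaaaab and C4 == 0x7fffffff / 3.  */
  for (int32_t x = -1000000; x < 1000000; x++)
    assert ((x % 3 == 0)
	    == ((uint32_t) x * 0xaaaaaaabu + 0x2aaaaaaau <= 0x55555554u));
  return 0;
}
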
/* Generate code to calculate the expression OPS (an exploded comparison)
   using a store-flag instruction and return an rtx for the result.
   OPS reflects a comparison.
@@ -11548,7 +11785,7 @@ do_store_flag (sepops ops, rtx target, machine_mode mode)
  STRIP_NOPS (arg0);
  STRIP_NOPS (arg1);

  /* For vector typed comparisons emit code to generate the desired
     all-ones or all-zeros mask.  Conveniently use the VEC_COND_EXPR
     expander for this.  */
@@ -11567,6 +11804,24 @@ do_store_flag (sepops ops, rtx target, machine_mode mode)
	}
    }

  /* Optimize (x % C1) == C2 or (x % C1) != C2 into
     (x - C2) * C3 < C4 if it is beneficial.  */
  if ((ops->code == EQ_EXPR || ops->code == NE_EXPR)
      && TREE_CODE (arg0) == SSA_NAME
      && TREE_CODE (arg1) == INTEGER_CST)
    {
      enum tree_code code = maybe_optimize_mod_cmp (ops->code, &arg0, &arg1);
      if (code != ops->code)
	{
	  struct separate_ops nops = *ops;
	  nops.code = ops->code = code;
	  nops.op0 = arg0;
	  nops.op1 = arg1;
	  nops.type = TREE_TYPE (arg0);
	  return do_store_flag (&nops, target, mode);
	}
    }

  /* Get the rtx comparison code to use.  We know that EXP is a comparison
     operation of some type.  Some comparisons against 1 and -1 can be
     converted to comparisons with zero.  Do so here so that the tests

gcc/expr.h

@@ -290,6 +290,8 @@ expand_normal (tree exp)
   a string constant.  */
extern tree string_constant (tree, tree *, tree *, tree *);

extern enum tree_code maybe_optimize_mod_cmp (enum tree_code, tree *, tree *);

/* Two different ways of generating switch statements.  */
extern int try_casesi (tree, tree, tree, tree, rtx, rtx, rtx, profile_probability);
extern int try_tablejump (tree, tree, tree, tree, rtx, rtx, profile_probability);

gcc/testsuite/ChangeLog

@@ -1,3 +1,9 @@
2018-09-12  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/82853
	* gcc.target/i386/pr82853-1.c: New test.
	* gcc.target/i386/pr82853-2.c: New test.

2018-09-12  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87280

gcc/testsuite/gcc.target/i386/pr82853-1.c

@@ -0,0 +1,15 @@
/* PR middle-end/82853 */
/* { dg-do compile } */
/* { dg-options "-O2 -mno-bmi2 -mtune=generic" } */
/* { dg-final { scan-assembler-times "mul\[lq]\t" 7 } } */
/* { dg-final { scan-assembler-not "div\[lq]\t" } } */
/* { dg-final { scan-assembler-not "lea\[lq]\t\[^\n\r]*,\[^\n\r]*,2\\)" } } */
unsigned f1 (unsigned x) { return (x % 679U) == 0; }
unsigned f2 (unsigned x) { return (x % 1738U) == 0; }
void bar (void);
void f3 (unsigned x) { if (x % 3 == 0) bar (); }
void f4 (unsigned x) { if (x % 3 == 1) bar (); }
void f5 (unsigned x) { if (x % 3 == 2) bar (); }
int f6 (int x) { return x % 3 == 0; }
int f7 (int x) { return x % 6 == 0; }

gcc/testsuite/gcc.target/i386/pr82853-2.c

@@ -0,0 +1,7 @@
/* PR middle-end/82853 */
/* { dg-do compile } */
/* { dg-options "-O2 -mno-bmi2 -mtune=generic" } */
/* { dg-final { scan-assembler-times "mul\[lq]\t" 2 } } */
/* { dg-final { scan-assembler-not "div\[lq]\t" } } */
unsigned f1 (unsigned x, unsigned *y) { *y = x / 679U; return (x % 679U) == 0; }
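
The f1 test here exercises the immediate-use punt in maybe_optimize_mod_cmp: because the quotient x / 679U is live in the same basic block, the new transformation is skipped and the remainder test is expected to be derived from the quotient instead, roughly as in this editor's sketch (not literal compiler output):

/* Editor's sketch: when x / 679U is needed anyway, the x % 679U == 0 test
   costs only one extra multiply and subtract.  */
unsigned
f1_equiv (unsigned x, unsigned *y)
{
  unsigned q = x / 679U;	/* itself expanded as multiply-high + shift */
  *y = q;
  return x - q * 679U == 0;
}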