match.pd: Canonicalize (x + (x << cst)) into (x * cst2) [PR94800]

The popcount* testcases show yet another creative way to write popcount,
but rather than adjusting the popcount matcher to deal with it, I think
we just should canonicalize those (X + (X << C) to X * (1 + (1 << C))
and (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2)), because for
multiplication we already have simplification rules that can handle nested
multiplication (X * CST1 * CST2), while the the shifts and adds we have
nothing like that.  And user could have written the multiplication anyway,
so if we don't emit the fastest or smallest code for the multiplication by
constant, we should improve that.  At least on the testcases seems the
emitted code is reasonable according to cost, except that perhaps we could
in some cases try to improve expansion of vector multiplication by
uniform constant.

2020-05-05  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/94800
	* match.pd (X + (X << C) to X * (1 + (1 << C)),
	(X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New
	canonicalizations.

	* gcc.dg/tree-ssa/pr94800.c: New test.
	* gcc.dg/tree-ssa/popcount5.c: New test.
	* gcc.dg/tree-ssa/popcount5l.c: New test.
	* gcc.dg/tree-ssa/popcount5ll.c: New test.
This commit is contained in:
Jakub Jelinek 2020-05-05 11:36:47 +02:00
parent 7f916201ac
commit 144aee70b8
7 changed files with 202 additions and 0 deletions

View File

@ -1,5 +1,10 @@
2020-05-05 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/94800
* match.pd (X + (X << C) to X * (1 + (1 << C)),
(X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New
canonicalizations.
PR target/94942
* config/i386/mmx.md (*vec_dupv4hi): Use xYw constraints instead of Yv.

View File

@ -2570,6 +2570,41 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
&& single_use (@3))
(mult (plusminus @2 { build_one_cst (type); }) @0))))))
#if GIMPLE
/* Canonicalize X + (X << C) into X * (1 + (1 << C)) and
(X << C1) + (X << C2) into X * ((1 << C1) + (1 << C2)). */
(simplify
(plus:c @0 (lshift:s @0 INTEGER_CST@1))
(if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
&& tree_fits_uhwi_p (@1)
&& tree_to_uhwi (@1) < element_precision (type))
(with { tree t = type;
if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t);
wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1),
element_precision (type));
w += 1;
tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t)
: t, w);
cst = build_uniform_cst (t, cst); }
(convert (mult (convert:t @0) { cst; })))))
(simplify
(plus (lshift:s @0 INTEGER_CST@1) (lshift:s @0 INTEGER_CST@2))
(if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
&& tree_fits_uhwi_p (@1)
&& tree_to_uhwi (@1) < element_precision (type)
&& tree_fits_uhwi_p (@2)
&& tree_to_uhwi (@2) < element_precision (type))
(with { tree t = type;
if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t);
unsigned int prec = element_precision (type);
wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), prec);
w += wi::set_bit_in_zero (tree_to_uhwi (@2), prec);
tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t)
: t, w);
cst = build_uniform_cst (t, cst); }
(convert (mult (convert:t @0) { cst; })))))
#endif
/* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */
(for minmax (min max FMIN_ALL FMAX_ALL)

View File

@ -1,5 +1,11 @@
2020-05-05 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/94800
* gcc.dg/tree-ssa/pr94800.c: New test.
* gcc.dg/tree-ssa/popcount5.c: New test.
* gcc.dg/tree-ssa/popcount5l.c: New test.
* gcc.dg/tree-ssa/popcount5ll.c: New test.
PR target/94942
* gcc.target/i386/pr94942.c: New test.

View File

@ -0,0 +1,22 @@
/* PR tree-optimization/94800 */
/* { dg-do compile } */
/* { dg-require-effective-target popcount } */
/* { dg-require-effective-target int32plus } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
const unsigned m1 = 0x55555555UL;
const unsigned m2 = 0x33333333UL;
const unsigned m4 = 0x0F0F0F0FUL;
const int shift = 24;
int popcount64c(unsigned x)
{
x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
x = (x + (x >> 4)) & m4;
x += (x << 8);
x += (x << 16);
return x >> shift;
}
/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */

View File

@ -0,0 +1,32 @@
/* PR tree-optimization/94800 */
/* { dg-do compile { target int32plus } } */
/* { dg-require-effective-target popcountl } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
#if __SIZEOF_LONG__ == 4
const unsigned long m1 = 0x55555555UL;
const unsigned long m2 = 0x33333333UL;
const unsigned long m4 = 0x0F0F0F0FUL;
const int shift = 24;
#else
const unsigned long m1 = 0x5555555555555555UL;
const unsigned long m2 = 0x3333333333333333UL;
const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL;
const int shift = 56;
#endif
int popcount64c(unsigned long x)
{
x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
x = (x + (x >> 4)) & m4;
x += (x << 8);
x += (x << 16);
#if __SIZEOF_LONG__ != 4
x += (x << 32);
#endif
return x >> shift;
}
/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */

View File

@ -0,0 +1,22 @@
/* PR tree-optimization/94800 */
/* { dg-do compile } */
/* { dg-require-effective-target popcountll } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
const unsigned long long m1 = 0x5555555555555555ULL;
const unsigned long long m2 = 0x3333333333333333ULL;
const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL;
const int shift = 56;
int popcount64c(unsigned long long x)
{
x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
x = (x + (x >> 4)) & m4;
x += (x << 8);
x += (x << 16);
x += (x << 32);
return x >> shift;
}
/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */

View File

@ -0,0 +1,80 @@
/* PR tree-optimization/94800 */
/* { dg-do compile { target { ilp32 || lp64 } } } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
/* { dg-final { scan-tree-dump-times " \* 72340172838076673" 2 "optimized" } } */
/* { dg-final { scan-tree-dump-times " \* 16843009" 2 "optimized" } } */
/* { dg-final { scan-tree-dump-times " \* 289360691352306692" 2 "optimized" } } */
/* { dg-final { scan-tree-dump-times " \* 1229782938247303441" 2 "optimized" } } */
/* { dg-final { scan-tree-dump-not "<<" "optimized" } } */
unsigned long long int
foo (unsigned long long int x)
{
unsigned long long int a = x + (x << 8);
unsigned long long int b = a + (a << 16);
unsigned long long int c = b + (b << 32);
return c;
}
unsigned int
bar (unsigned int x)
{
unsigned int a = x + (x << 8);
unsigned int b = a + (a << 16);
return b;
}
unsigned long long int
baz (unsigned long long int x)
{
unsigned long long int a = (x << 2) + (x << 10);
unsigned long long int b = a + (a << 16);
unsigned long long int c = b + (b << 32);
return c;
}
unsigned long long int
qux (unsigned long long int x)
{
unsigned long long int a = x + (x << 4);
unsigned long long int b = a + (a << 8);
unsigned long long int c = b + (b << 16);
unsigned long long int d = c + (c << 32);
return d;
}
long long int
quux (long long int x)
{
long long int a = x + (x << 8);
long long int b = a + (a << 16);
long long int c = b + (b << 32);
return c;
}
int
corge (int x)
{
int a = x + (x << 8);
int b = a + (a << 16);
return b;
}
long long int
garply (long long int x)
{
long long int a = (x << 2) + (x << 10);
long long int b = a + (a << 16);
long long int c = b + (b << 32);
return c;
}
long long int
waldo (long long int x)
{
long long int a = x + (x << 4);
long long int b = a + (a << 8);
long long int c = b + (b << 16);
long long int d = c + (c << 32);
return d;
}