re PR target/88838 ([SVE] Use 32-bit WHILELO in LP64 mode)
gcc/ChangeLog:

2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	PR target/88838
	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If the
	compare_type is not with Pmode size, we will create an IV with
	Pmode size with truncated use (i.e. converted to the correct type).
	* tree-vect-loop.c (vect_verify_full_masking): Find IV type.
	(vect_iv_limit_for_full_masking): New. Factored out of
	vect_set_loop_condition_masked.
	* tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
	(vect_iv_limit_for_full_masking): Declare.

gcc/testsuite/ChangeLog:

2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	PR target/88838
	* gcc.target/aarch64/pr88838.c: New test.
	* gcc.target/aarch64/sve/while_1.c: Adjust.

From-SVN: r272233
This commit is contained in:
parent fa9863e7d3
commit 9b884225bf
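In scalar terms, the change can be sketched as follows. This is a hypothetical illustration (not the vectorizer's actual output, and assuming n >= 0): the IV is created in Pmode (64 bits on LP64) and only its use in the exit comparison is truncated to the narrower compare type, so the address arithmetic can consume the IV directly and no sign extension (sxtw) is needed.

#include <stdint.h>

/* Sketch only: a 64-bit IV whose compare-side use is truncated to
   32 bits.  The loop shape mirrors the new test below; names are
   illustrative, not GCC internals.  */
void
f_sketch (int *restrict x, int *restrict y, int *restrict z, int n)
{
  for (uint64_t iv = 0; (uint32_t) iv < (uint32_t) n; iv += 1)
    x[iv] = y[iv] + z[iv];
}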
gcc/ChangeLog
@@ -1,3 +1,15 @@
+2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+	PR target/88838
+	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If the
+	compare_type is not with Pmode size, we will create an IV with
+	Pmode size with truncated use (i.e. converted to the correct type).
+	* tree-vect-loop.c (vect_verify_full_masking): Find IV type.
+	(vect_iv_limit_for_full_masking): New. Factored out of
+	vect_set_loop_condition_masked.
+	* tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
+	(vect_iv_limit_for_full_masking): Declare.
+
 2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
 
 	PR target/88834
gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+	PR target/88838
+	* gcc.target/aarch64/pr88838.c: New test.
+	* gcc.target/aarch64/sve/while_1.c: Adjust.
+
 2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
 
 	PR target/88834
gcc/testsuite/gcc.target/aarch64/pr88838.c (new file, 11 lines)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+  for (int i = 0; i < n; i += 1)
+    x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@
 TEST_ALL (ADD_LOOP)
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
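The scans flip from X-register operands (xzr, x[0-9]+) to W-register operands (wzr, w[0-9]+) because the WHILELO comparison now happens in the 32-bit type. As a hedged model of what the 32-bit WHILELO computes (the helper below is illustrative, following the architectural definition: lane i is active while base + i compares below the limit, unsigned):

#include <stdint.h>

/* Illustrative model of WHILELO p, wN, wM for a predicate with
   NLANES lanes.  */
static void
whilelo_w (uint8_t *pred, unsigned nlanes, uint32_t base, uint32_t limit)
{
  for (unsigned i = 0; i < nlanes; i++)
    pred[i] = (uint32_t) (base + i) < limit;
}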
gcc/tree-vect-loop-manip.c
@@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 			      bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
@@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+
+  tree zero_index = build_int_cst (iv_type, 0);
+  tree step = build_int_cst (iv_type,
+			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  /* Create IV of iv_type.  */
+  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
 	     insert_after, &index_before_incr, &index_after_incr);
 
+  zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
@@ -529,6 +535,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree next_mask = NULL_TREE;
   tree mask;
   unsigned int i;
+  gimple_seq test_seq = NULL;
+  test_index = gimple_convert (&test_seq, compare_type, test_index);
+  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+
   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
     {
       /* Previous masks will cover BIAS scalars.  This mask covers the
@@ -637,12 +647,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
-  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
   tree orig_niters = niters;
 
   /* Type of the initial value of NITERS.  */
   tree ni_actual_type = TREE_TYPE (niters);
   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 
   /* Convert NITERS to the same size as the compare.  */
   if (compare_precision > ni_actual_precision
@@ -661,33 +671,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
-  /* Convert skip_niters to the right type.  */
-  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-
-  /* Now calculate the value that the induction variable must be able
-     to hit in order to ensure that we end the loop with an all-false mask.
-     This involves adding the maximum number of inactive trailing scalar
-     iterations.  */
-  widest_int iv_limit;
-  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
-  if (known_max_iters)
-    {
-      if (niters_skip)
-	{
-	  /* Add the maximum number of skipped iterations to the
-	     maximum iteration count.  */
-	  if (TREE_CODE (niters_skip) == INTEGER_CST)
-	    iv_limit += wi::to_widest (niters_skip);
-	  else
-	    iv_limit += max_vf - 1;
-	}
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-	 the maximum in-range IV value.  Round this value down to the previous
-	 vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
-    }
-
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
   /* Get the vectorization factor in tree form.  */
   tree vf = build_int_cst (compare_type,
			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +701,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
       /* See whether zero-based IV would ever generate all-false masks
	 before wrapping around.  */
       bool might_wrap_p
-	= (!known_max_iters
+	= (iv_limit == -1
	   || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
				  UNSIGNED)
	       > compare_precision));
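might_wrap_p now asks whether a zero-based IV could overflow the compare type before the final all-false mask: either the IV limit is unknown (-1), or the limit scaled by the rgroup's scalars per iteration needs more bits than the compare type provides. A small standalone illustration with hypothetical numbers (min_precision_u stands in for wi::min_precision):

#include <stdint.h>
#include <stdio.h>

/* Number of bits needed to represent X as an unsigned value.  */
static unsigned
min_precision_u (uint64_t x)
{
  unsigned bits = 0;
  while (x)
    {
      bits++;
      x >>= 1;
    }
  return bits;
}

int
main (void)
{
  uint64_t iv_limit = UINT32_MAX;      /* hypothetical max latch count */
  unsigned max_nscalars_per_iter = 2;  /* two scalars per mask element */
  unsigned compare_precision = 32;
  /* 0xffffffff * 2 needs 33 bits, more than the 32-bit compare type,
     so a zero-based IV would wrap before the loop ends.  */
  printf ("might_wrap_p = %d\n",
	  min_precision_u (iv_limit * max_nscalars_per_iter)
	  > compare_precision);
  return 0;
}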
gcc/tree-vect-loop.c
@@ -1030,6 +1030,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1050,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1058,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+				      UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1067,10 +1077,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
	  && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	{
	  /* Although we could stop as soon as we find a valid mode,
-	     it's often better to continue until we hit Pmode, since the
-	     operands to the WHILE are more likely to be reusable in
-	     address calculations.  */
-	  cmp_type = this_type;
+	     there are at least two reasons why that's not always the
+	     best choice:
+
+	     - An IV that's Pmode or wider is more likely to be reusable
+	       in address calculations than an IV that's narrower than
+	       Pmode.
+
+	     - Doing the comparison in IV_PRECISION or wider allows
+	       a natural 0-based IV, whereas using a narrower comparison
+	       type requires mitigations against wrap-around.
+
+	     Conversely, if the IV limit is variable, doing the comparison
+	     in a wider type than the original type can introduce
+	     unnecessary extensions, so picking the widest valid mode
+	     is not always a good choice either.
+
+	     Here we prefer the first IV type that's Pmode or wider,
+	     and the first comparison type that's IV_PRECISION or wider.
+	     (The comparison type must be no wider than the IV type,
+	     to avoid extensions in the vector loop.)
+
+	     ??? We might want to try continuing beyond Pmode for ILP32
+	     targets if CMP_BITS < IV_PRECISION.  */
+	  iv_type = this_type;
+	  if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+	    cmp_type = this_type;
	  if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
	    break;
	}
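Restated as a standalone sketch, the selection policy in the new comment: keep widening the IV type until it reaches Pmode, but stop widening the compare type once it can hold IV_PRECISION bits. Everything here is an illustrative stand-in for the GCC internals, assuming an LP64 target (Pmode is 64 bits) and that WHILE_ULT is available at every candidate width:

#include <stdio.h>

/* Assumed candidate widths, in increasing order.  */
static const unsigned candidate_bits[] = { 8, 16, 32, 64 };
enum { PMODE_BITS = 64 };  /* assumption: LP64 */

int
main (void)
{
  unsigned iv_precision = 17;  /* hypothetical: limit needs 17 bits */
  unsigned cmp_bits = 0, iv_bits = 0;
  for (unsigned i = 0; i < sizeof candidate_bits / sizeof *candidate_bits; i++)
    {
      unsigned bits = candidate_bits[i];
      iv_bits = bits;	 /* prefer the widest IV type seen so far */
      if (cmp_bits == 0 || iv_precision > cmp_bits)
	cmp_bits = bits; /* stop widening once it holds IV_PRECISION */
      if (bits >= PMODE_BITS)
	break;		 /* first Pmode-wide IV type: done */
    }
  /* Prints cmp_bits = 32, iv_bits = 64: a 32-bit WHILELO driven by a
     64-bit IV, as in the new test.  */
  printf ("cmp_bits = %u, iv_bits = %u\n", cmp_bits, iv_bits);
  return 0;
}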
@@ -1081,6 +1113,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
@@ -9014,3 +9047,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+	/* Make a conservatively-correct assumption.  */
+	iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+	 the maximum in-range IV value.  Round this value down to the previous
+	 vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
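The rounding step at the end of the new function is the subtle part: IV_LIMIT is rounded down to the previous vector-alignment boundary and then bumped by one full vector iteration, guaranteeing the IV can reach a value whose mask is all-false. A worked example with hypothetical numbers:

#include <stdio.h>

int
main (void)
{
  long iv_limit = 10;  /* hypothetical maximum latch iteration count */
  long max_vf = 4;     /* maximum vectorization factor */
  long align = 4;      /* stand-in for known_alignment (vf) with VF = 4 */
  /* 10 & -4 rounds down to 8; adding max_vf gives 12, the highest
     value the IV must be able to hold.  */
  iv_limit = (iv_limit & -align) + max_vf;
  printf ("iv_limit = %ld\n", iv_limit);  /* prints 12 */
  return 0;
}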
gcc/tree-vectorizer.h
@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
      is false and vectorized loop otherwise.  */
   tree simd_if_cond;
 
+  /* Type of the IV to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree iv_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct dr_vec_info *unaligned_dr;
 
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
+#define LOOP_VINFO_MASK_IV_TYPE(L)         (L)->iv_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
@@ -1582,6 +1587,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
 /* FORNOW: Used in tree-parloops.c.  */
 extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
						   bool *, bool);
+extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
 /* Used in gimple-loop-interchange.c.  */
 extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
				   enum tree_code);