mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-15 14:01:04 +08:00
vect: Support masked gather loads with SLP
This patch extends the previous SLP gather load support so that it can handle masked loads too.

gcc/
	* tree-vect-slp.c (arg1_arg4_map): New variable.
	(vect_get_operand_map): Handle IFN_MASK_GATHER_LOAD.
	(vect_build_slp_tree_1): Likewise.
	(vect_build_slp_tree_2): Likewise.
	* tree-vect-stmts.c (vectorizable_load): Expect the mask to be the last SLP child node rather than the first.

gcc/testsuite/
	* gcc.dg/vect/vect-gather-3.c: New test.
	* gcc.dg/vect/vect-gather-4.c: Likewise.
	* gcc.target/aarch64/sve/mask_gather_load_8.c: Likewise.
This commit is contained in:
parent
32ede1083f
commit
8af3f53d32
64
gcc/testsuite/gcc.dg/vect/vect-gather-3.c
Normal file
64
gcc/testsuite/gcc.dg/vect/vect-gather-3.c
Normal file
@ -0,0 +1,64 @@
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define N 16

/* Conditional two-lane gather.  For each I in [0, N):

     Y[I * 2]     = INDICES[I * 2] < N * 2     ? X[INDICES[I * 2]] + 1     : 1
     Y[I * 2 + 1] = INDICES[I * 2 + 1] < N * 2 ? X[INDICES[I * 2 + 1]] + 2 : 2

   X is only read for lanes whose index passes the test.  The test is a
   signed upper bound only; negative indices are not rejected, so callers
   must not pass them.  Y and INDICES must each hold N * 2 elements.  */

void __attribute__((noipa))
f (int *restrict y, int *restrict x, int *restrict indices)
{
  for (int i = 0; i < N; ++i)
    {
      /* Even lane: adds 1, or yields just 1 when the index is too big.  */
      y[i * 2] = (indices[i * 2] < N * 2
                  ? x[indices[i * 2]] + 1
                  : 1);
      /* Odd lane: same shape with the constant 2.  */
      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
                      ? x[indices[i * 2 + 1]] + 2
                      : 2);
    }
}
|
||||
|
||||
/* Output buffer: written by f and compared against EXPECTED in main.  */
int y[N * 2];

/* Table that f gathers from.  */
int x[N * 2] = {
  72704, 52152, 51301, 96681,
  57937, 60490, 34504, 60944,
  42225, 28333, 88336, 74300,
  29250, 20484, 38852, 91536,
  86917, 63941, 31590, 21998,
  22419, 26974, 28668, 13968,
  3451, 20247, 44089, 85521,
  22871, 87362, 50555, 85939
};

/* Gather indices.  The entries that are not below N * 2 (0x10000, 0xcafe0,
   0x20000, 0x70000, 32 and 0xabcdef) fail the bounds test in f, so X is
   not read for those lanes.  */
int indices[N * 2] = {
  15, 0x10000, 0xcafe0, 19,
  7, 22, 19, 1,
  0x20000, 0x70000, 15, 30,
  5, 12, 11, 11,
  10, 25, 5, 20,
  22, 24, 32, 28,
  30, 19, 6, 0xabcdef,
  7, 12, 8, 21
};

/* Reference results: for even I, x[indices[I]] + 1 (or 1 when indices[I]
   is not below N * 2); for odd I, x[indices[I]] + 2 (or 2).  */
int expected[N * 2] = {
  91537, 2, 1, 22000,
  60945, 28670, 21999, 52154,
  1, 2, 91537, 50557,
  60491, 29252, 74301, 74302,
  88337, 20249, 60491, 22421,
  28669, 3453, 1, 22873,
  50556, 22000, 34505, 2,
  60945, 29252, 42226, 26976
};
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
check_vect ();
|
||||
|
||||
f (y, x, indices);
|
||||
for (int i = 0; i < 32; ++i)
|
||||
if (y[i] != expected[i])
|
||||
__builtin_abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target { vect_gather_load_ifn && vect_masked_load } } } } */
|
48
gcc/testsuite/gcc.dg/vect/vect-gather-4.c
Normal file
48
gcc/testsuite/gcc.dg/vect/vect-gather-4.c
Normal file
@ -0,0 +1,48 @@
|
||||
/* { dg-do compile } */
|
||||
|
||||
#define N 16

/* Two-lane conditional gather in which the lanes read from different
   base arrays: the even lane gathers from X1 and the odd lane from X2.
   Each lane yields just its constant (1 or 2) when its index is not
   below N * 2.  Presumably the differing bases are what should prevent
   the pair from being treated as one SLP gather; the dg-final directive
   in this file expects the "only SLP stmts" message to be absent.  */

void
f1 (int *restrict y, int *restrict x1, int *restrict x2,
    int *restrict indices)
{
  for (int i = 0; i < N; ++i)
    {
      y[i * 2] = (indices[i * 2] < N * 2
                  ? x1[indices[i * 2]] + 1
                  : 1);
      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
                      ? x2[indices[i * 2 + 1]] + 2
                      : 2);
    }
}
|
||||
|
||||
void
|
||||
f2 (int *restrict y, int *restrict x, int *restrict indices)
|
||||
{
|
||||
for (int i = 0; i < N; ++i)
|
||||
{
|
||||
y[i * 2] = (indices[i * 2] < N * 2
|
||||
? x[indices[i * 2]] + 1
|
||||
: 1);
|
||||
y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
|
||||
? x[indices[i * 2 + 1] * 2] + 2
|
||||
: 2);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
f3 (int *restrict y, int *restrict x, int *restrict indices)
|
||||
{
|
||||
for (int i = 0; i < N; ++i)
|
||||
{
|
||||
y[i * 2] = (indices[i * 2] < N * 2
|
||||
? x[indices[i * 2]] + 1
|
||||
: 1);
|
||||
y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
|
||||
? x[(unsigned int) indices[i * 2 + 1]] + 2
|
||||
: 2);
|
||||
}
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
|
65
gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
Normal file
65
gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
Normal file
@ -0,0 +1,65 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O3 -fno-vect-cost-model" } */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Conditional two-lane gather with 32-bit data and signed 32-bit
   indices.  For each of 100 iterations, the even lane stores
   X[INDEX] + 1 and the odd lane X[INDEX] + 2; a lane whose index is not
   below 128 stores just the constant.  Y and INDEX must hold 200
   elements.  */

void
f1 (int32_t *restrict y, int32_t *restrict x, int32_t *restrict index)
{
  for (int i = 0; i < 100; ++i)
    {
      y[i * 2] = (index[i * 2] < 128
                  ? x[index[i * 2]] + 1
                  : 1);
      y[i * 2 + 1] = (index[i * 2 + 1] < 128
                      ? x[index[i * 2 + 1]] + 2
                      : 2);
    }
}
|
||||
|
||||
/* Conditional two-lane gather with 32-bit data and unsigned 32-bit
   indices.  For each of 100 iterations, the even lane stores
   X[INDEX] + 1 and the odd lane X[INDEX] + 2; a lane whose index is not
   below 128 stores just the constant.  Y and INDEX must hold 200
   elements.  */

void
f2 (int32_t *restrict y, int32_t *restrict x, uint32_t *restrict index)
{
  for (int i = 0; i < 100; ++i)
    {
      y[i * 2] = (index[i * 2] < 128
                  ? x[index[i * 2]] + 1
                  : 1);
      y[i * 2 + 1] = (index[i * 2 + 1] < 128
                      ? x[index[i * 2 + 1]] + 2
                      : 2);
    }
}
|
||||
|
||||
/* Conditional two-lane gather with 32-bit data and unsigned 64-bit
   indices.  For each of 100 iterations, the even lane stores
   X[INDEX] + 1 and the odd lane X[INDEX] + 2; a lane whose index is not
   below 128 stores just the constant.  Y and INDEX must hold 200
   elements.  */

void
f3 (int32_t *restrict y, int32_t *restrict x, uint64_t *restrict index)
{
  for (int i = 0; i < 100; ++i)
    {
      y[i * 2] = (index[i * 2] < 128
                  ? x[index[i * 2]] + 1
                  : 1);
      y[i * 2 + 1] = (index[i * 2 + 1] < 128
                      ? x[index[i * 2 + 1]] + 2
                      : 2);
    }
}
|
||||
|
||||
/* Conditional two-lane gather with 64-bit data and unsigned 64-bit
   indices.  For each of 100 iterations, the even lane stores
   X[INDEX] + 1 and the odd lane X[INDEX] + 2; a lane whose index is not
   below 128 stores just the constant.  Y and INDEX must hold 200
   elements.  */

void
f4 (int64_t *restrict y, int64_t *restrict x, uint64_t *restrict index)
{
  for (int i = 0; i < 100; ++i)
    {
      y[i * 2] = (index[i * 2] < 128
                  ? x[index[i * 2]] + 1
                  : 1);
      y[i * 2 + 1] = (index[i * 2 + 1] < 128
                      ? x[index[i * 2 + 1]] + 2
                      : 2);
    }
}
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw #?2\]} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw #?2\]} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?2\]} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?3\]} 1 } } */
|
@ -461,6 +461,7 @@ static const int cond_expr_maps[3][5] = {
|
||||
};
|
||||
static const int arg1_map[] = { 1, 1 };
|
||||
static const int arg2_map[] = { 1, 2 };
|
||||
static const int arg1_arg4_map[] = { 2, 1, 4 };
|
||||
|
||||
/* For most SLP statements, there is a one-to-one mapping between
|
||||
gimple arguments and child nodes. If that is not true for STMT,
|
||||
@ -494,6 +495,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
|
||||
case IFN_GATHER_LOAD:
|
||||
return arg1_map;
|
||||
|
||||
case IFN_MASK_GATHER_LOAD:
|
||||
return arg1_arg4_map;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -1000,7 +1004,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
|
||||
else
|
||||
rhs_code = CALL_EXPR;
|
||||
|
||||
if (cfn == CFN_MASK_LOAD || cfn == CFN_GATHER_LOAD)
|
||||
if (cfn == CFN_MASK_LOAD
|
||||
|| cfn == CFN_GATHER_LOAD
|
||||
|| cfn == CFN_MASK_GATHER_LOAD)
|
||||
load_p = true;
|
||||
else if ((internal_fn_p (cfn)
|
||||
&& !vectorizable_internal_fn_p (as_internal_fn (cfn)))
|
||||
@ -1229,7 +1235,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
|
||||
} /* Grouped access. */
|
||||
else
|
||||
{
|
||||
if (load_p && rhs_code != CFN_GATHER_LOAD)
|
||||
if (load_p
|
||||
&& rhs_code != CFN_GATHER_LOAD
|
||||
&& rhs_code != CFN_MASK_GATHER_LOAD)
|
||||
{
|
||||
/* Not grouped load. */
|
||||
if (dump_enabled_p ())
|
||||
@ -1711,7 +1719,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
|
||||
{
|
||||
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
|
||||
gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
|
||||
|| gimple_call_internal_p (stmt, IFN_GATHER_LOAD));
|
||||
|| gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
|
||||
|| gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
|
||||
else
|
||||
{
|
||||
*max_nunits = this_max_nunits;
|
||||
|
@ -8595,6 +8595,7 @@ vectorizable_load (vec_info *vinfo,
|
||||
return false;
|
||||
|
||||
tree mask = NULL_TREE, mask_vectype = NULL_TREE;
|
||||
int mask_index = -1;
|
||||
if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
|
||||
{
|
||||
scalar_dest = gimple_assign_lhs (assign);
|
||||
@ -8626,12 +8627,12 @@ vectorizable_load (vec_info *vinfo,
|
||||
if (!scalar_dest)
|
||||
return false;
|
||||
|
||||
int mask_index = internal_fn_mask_index (ifn);
|
||||
mask_index = internal_fn_mask_index (ifn);
|
||||
/* ??? For SLP the mask operand is always last. */
|
||||
if (mask_index >= 0 && slp_node)
|
||||
mask_index = SLP_TREE_CHILDREN (slp_node).length () - 1;
|
||||
if (mask_index >= 0
|
||||
&& !vect_check_scalar_mask (vinfo, stmt_info, slp_node,
|
||||
/* ??? For SLP we only have operands for
|
||||
the mask operand. */
|
||||
slp_node ? 0 : mask_index,
|
||||
&& !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
|
||||
&mask, NULL, &mask_dt, &mask_vectype))
|
||||
return false;
|
||||
}
|
||||
@ -9393,8 +9394,14 @@ vectorizable_load (vec_info *vinfo,
|
||||
vec<tree> vec_offsets = vNULL;
|
||||
auto_vec<tree> vec_masks;
|
||||
if (mask)
|
||||
vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
|
||||
mask, &vec_masks, mask_vectype, NULL_TREE);
|
||||
{
|
||||
if (slp_node)
|
||||
vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index],
|
||||
&vec_masks);
|
||||
else
|
||||
vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
|
||||
&vec_masks, mask_vectype);
|
||||
}
|
||||
tree vec_mask = NULL_TREE;
|
||||
poly_uint64 group_elt = 0;
|
||||
for (j = 0; j < ncopies; j++)
|
||||
|
Loading…
x
Reference in New Issue
Block a user