mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-03-19 15:11:08 +08:00
PR target/107548: Handle vec_select in STV on x86.
This patch enhances x86's STV pass to handle VEC_SELECT during general scalar chain conversion, performing SImode scalar extraction from V4SI and DImode scalar extraction from V2DI in vector registers. The motivating test case from bugzilla is: typedef unsigned int v4si __attribute__((vector_size(16))); unsigned int f (v4si a, v4si b) { a[0] += b[0]; return a[0] + a[1]; } currently with -O2 -march=znver2 this generates: vpextrd $1, %xmm0, %edx vmovd %xmm0, %eax addl %edx, %eax vmovd %xmm1, %edx addl %edx, %eax ret which performs three transfers from the vector unit to the scalar unit, and performs the two additions there. With this patch, we now generate: vmovdqa %xmm0, %xmm2 vpshufd $85, %xmm0, %xmm0 vpaddd %xmm0, %xmm2, %xmm0 vpaddd %xmm1, %xmm0, %xmm0 vmovd %xmm0, %eax ret which performs the two additions in the vector unit, and then transfers the result to the scalar unit. Technically the (cheap) movdqa isn't needed with better register allocation (or this could be cleaned up during peephole2), but even so this transform is still a win. 2022-12-23 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/107548 * config/i386/i386-features.cc (scalar_chain::add_insn): The operands of a VEC_SELECT don't need to be added to the scalar chain. (general_scalar_chain::compute_convert_gain) <case VEC_SELECT>: Provide gains for performing STV on a VEC_SELECT. (general_scalar_chain::convert_insn): Convert VEC_SELECT to pshufd, psrldq or no-op. (general_scalar_to_vector_candidate_p): Handle VEC_SELECT of a single element from a vector register to a scalar register. gcc/testsuite/ChangeLog PR target/107548 * gcc.target/i386/pr107548-1.c: New test V4SI case. * gcc.target/i386/pr107548-2.c: New test V2DI case.
This commit is contained in:
parent
24a7980d0f
commit
0b2c1369d0
@ -429,6 +429,11 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
|
||||
for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
|
||||
if (!HARD_REGISTER_P (DF_REF_REG (ref)))
|
||||
analyze_register_chain (candidates, ref);
|
||||
|
||||
/* The operand(s) of VEC_SELECT don't need to be converted/convertible. */
|
||||
if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
|
||||
return;
|
||||
|
||||
for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
|
||||
if (!DF_REF_REG_MEM_P (ref))
|
||||
analyze_register_chain (candidates, ref);
|
||||
@ -629,6 +634,23 @@ general_scalar_chain::compute_convert_gain ()
|
||||
}
|
||||
break;
|
||||
|
||||
case VEC_SELECT:
|
||||
if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
|
||||
{
|
||||
// movd (4 bytes) replaced with movdqa (4 bytes).
|
||||
if (!optimize_insn_for_size_p ())
|
||||
igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pshufd; movd replaced with pshufd.
|
||||
if (optimize_insn_for_size_p ())
|
||||
igain += COSTS_N_BYTES (4);
|
||||
else
|
||||
igain += ix86_cost->sse_to_integer;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
@ -1167,6 +1189,24 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
|
||||
convert_op (&src, insn);
|
||||
break;
|
||||
|
||||
case VEC_SELECT:
|
||||
if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
|
||||
src = XEXP (src, 0);
|
||||
else if (smode == DImode)
|
||||
{
|
||||
rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
|
||||
dst = gen_lowpart (V1TImode, dst);
|
||||
src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
|
||||
}
|
||||
else
|
||||
{
|
||||
rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
|
||||
rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
|
||||
rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
|
||||
src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
@ -1930,6 +1970,16 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
|
||||
case CONST_INT:
|
||||
return REG_P (dst);
|
||||
|
||||
case VEC_SELECT:
|
||||
/* Excluding MEM_P (dst) avoids interfering with vpextr[dq]. */
|
||||
return REG_P (dst)
|
||||
&& REG_P (XEXP (src, 0))
|
||||
&& GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
|
||||
: V4SImode)
|
||||
&& GET_CODE (XEXP (src, 1)) == PARALLEL
|
||||
&& XVECLEN (XEXP (src, 1), 0) == 1
|
||||
&& CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
25
gcc/testsuite/gcc.target/i386/pr107548-1.c
Normal file
25
gcc/testsuite/gcc.target/i386/pr107548-1.c
Normal file
@ -0,0 +1,25 @@
|
||||
/* { dg-do compile { target { ! ia32 } } } */
|
||||
/* { dg-options "-O2 -mstv -mno-stackrealign" } */
|
||||
typedef unsigned int v4si __attribute__((vector_size(16)));
|
||||
|
||||
/* Add element 0 of B into element 0 of A, then return the sum of
   elements 0 and 1 of the updated A.  */
unsigned int foo1 (v4si a, v4si b)
|
||||
{
|
||||
a[0] += b[0];
|
||||
return a[0] + a[1];
|
||||
}
|
||||
|
||||
/* Add element 0 of B into element 0 of A, then return the sum of
   elements 0 and 2 of the updated A.  */
unsigned int foo2 (v4si a, v4si b)
|
||||
{
|
||||
a[0] += b[0];
|
||||
return a[0] + a[2];
|
||||
}
|
||||
|
||||
/* Add element 0 of B into element 0 of A, then return the sum of
   elements 0 and 3 of the updated A.  */
unsigned int foo3 (v4si a, v4si b)
|
||||
{
|
||||
a[0] += b[0];
|
||||
return a[0] + a[3];
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "\tmovd\t" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "paddd" 6 } } */
|
||||
/* { dg-final { scan-assembler-not "addl" } } */
|
13
gcc/testsuite/gcc.target/i386/pr107548-2.c
Normal file
13
gcc/testsuite/gcc.target/i386/pr107548-2.c
Normal file
@ -0,0 +1,13 @@
|
||||
/* { dg-do compile { target { ! ia32 } } } */
|
||||
/* { dg-options "-O2 -mstv -mno-stackrealign" } */
|
||||
typedef unsigned long long v2di __attribute__((vector_size(16)));
|
||||
|
||||
/* Add element 0 of B into element 0 of A, then return the sum of
   elements 0 and 1 of the updated A (64-bit elements).  */
unsigned long long foo(v2di a, v2di b)
|
||||
{
|
||||
a[0] += b[0];
|
||||
return a[0] + a[1];
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not "\taddq\t" } } */
|
||||
/* { dg-final { scan-assembler-times "paddq" 2 } } */
|
||||
/* { dg-final { scan-assembler "psrldq" } } */
|
Loading…
x
Reference in New Issue
Block a user