arm: Improve thumb1_gen_const_int

Enable thumb1_gen_const_int to generate RTL or asm depending on the
context, so that we avoid duplicating code to handle constants in
Thumb-1 with -mpure-code.

Use a template so that the algorithm is effectively shared, and
rely on two classes to handle the actual emission as RTL or asm.

The generated sequence is improved to handle right-shiftable and small
values with fewer instructions. We now generate:

128:
        movs    r0, #128
264:
        movs    r3, #33
        lsls    r3, #3
510:
        movs    r3, #255
        lsls    r3, #1
512:
        movs    r3, #1
        lsls    r3, #9
764:
        movs    r3, #191
        lsls    r3, #2
65536:
        movs    r3, #1
        lsls    r3, #16
0x123456:
        movs    r3, #18 ;0x12
        lsls    r3, #8
        adds    r3, #52 ;0x34
        lsls    r3, #8
        adds    r3, #86 ;0x56
0x1123456:
        movs    r3, #137 ;0x89
        lsls    r3, #8
        adds    r3, #26 ;0x1a
        lsls    r3, #8
        adds    r3, #43 ;0x2b
        lsls    r3, #1
0x1000010:
        movs    r3, #16
        lsls    r3, #16
        adds    r3, #1
        lsls    r3, #4
0x1000011:
        movs    r3, #1
        lsls    r3, #24
        adds    r3, #17
-8192:
	movs	r3, #1
	lsls	r3, #13
	rsbs	r3, #0

The patch adds a testcase which does not fully exercise
thumb1_gen_const_int, as other existing patterns already catch small
constants.  These parts of thumb1_gen_const_int are used by
arm_thumb1_mi_thunk.

2020-11-02  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm.c (thumb1_const_rtl, thumb1_const_print): New
	classes.
	(thumb1_gen_const_int): Rename to ...
	(thumb1_gen_const_int_1): ... New helper function. Add capability
	to emit either RTL or asm, improve generated code.
	(thumb1_gen_const_int_rtl): New function.
	* config/arm/arm-protos.h (thumb1_gen_const_int): Rename to
	thumb1_gen_const_int_rtl.
	* config/arm/thumb1.md: Call thumb1_gen_const_int_rtl instead
	of thumb1_gen_const_int.

	gcc/testsuite/
	* gcc.target/arm/pure-code/no-literal-pool-m0.c: New.
This commit is contained in:
Christophe Lyon 2020-11-02 14:39:24 +00:00
parent 79680c1d5c
commit 011f5e92f8
4 changed files with 369 additions and 34 deletions

View File

@ -74,7 +74,7 @@ extern bool arm_small_register_classes_for_mode_p (machine_mode);
extern int const_ok_for_arm (HOST_WIDE_INT);
extern int const_ok_for_op (HOST_WIDE_INT, enum rtx_code);
extern int const_ok_for_dimode_op (HOST_WIDE_INT, enum rtx_code);
extern void thumb1_gen_const_int (rtx, HOST_WIDE_INT);
extern void thumb1_gen_const_int_rtl (rtx, HOST_WIDE_INT);
extern int arm_split_constant (RTX_CODE, machine_mode, rtx,
HOST_WIDE_INT, rtx, rtx, int);
extern int legitimate_pic_operand_p (rtx);

View File

@ -4528,38 +4528,6 @@ const_ok_for_dimode_op (HOST_WIDE_INT i, enum rtx_code code)
}
}
/* Build the 32-bit constant OP1 in OP0 with a movs/adds/lsls sequence,
   processing OP1 one byte at a time from the most significant byte
   down.  Zero bytes that precede the first non-zero byte emit
   nothing.  */
void
thumb1_gen_const_int (rtx op0, HOST_WIDE_INT op1)
{
  /* Becomes true once OP0 has received its first (mov) value.  */
  bool started = false;

  /* Upper three bytes: bit positions 24, 16 and 8.  */
  for (int pos = 24; pos > 0; pos -= 8)
    {
      const int chunk = (op1 >> pos) & 0xff;

      if (chunk != 0)
	{
	  /* The first non-zero byte is moved, later ones are added.  */
	  rtx src = GEN_INT (chunk);
	  if (started)
	    src = gen_rtx_PLUS (SImode, op0, src);
	  emit_set_insn (op0, src);
	  started = true;
	}

      /* Make room for the next byte once something has been emitted.  */
      if (started)
	emit_set_insn (op0, gen_rtx_ASHIFT (SImode, op0, GEN_INT (8)));
    }

  /* Lowest byte.  */
  const HOST_WIDE_INT low = op1 & 0xff;

  if (!started)
    {
      emit_set_insn (op0, GEN_INT (low));
      return;
    }

  if (low != 0)
    emit_set_insn (op0, gen_rtx_PLUS (SImode, op0, GEN_INT (low)));
}
/* Emit a sequence of insns to handle a large constant.
CODE is the code of the operation required, it can be any of SET, PLUS,
IOR, AND, XOR, MINUS;
@ -28263,6 +28231,198 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
default_internal_label (stream, prefix, labelno);
}
/* Define classes to generate code as RTL or output asm to a file.
Using templates then allows to use the same code to output code
sequences in the two formats. */
class thumb1_const_rtl
{
public:
thumb1_const_rtl (rtx dst) : dst (dst) {}
void mov (HOST_WIDE_INT val)
{
emit_set_insn (dst, GEN_INT (val));
}
void add (HOST_WIDE_INT val)
{
emit_set_insn (dst, gen_rtx_PLUS (SImode, dst, GEN_INT (val)));
}
void ashift (HOST_WIDE_INT shift)
{
emit_set_insn (dst, gen_rtx_ASHIFT (SImode, dst, GEN_INT (shift)));
}
void neg ()
{
emit_set_insn (dst, gen_rtx_NEG (SImode, dst));
}
private:
rtx dst;
};
/* Asm-text emitter: each operation prints one instruction, operating on
   the register whose number was given to the constructor, to the file
   given to the constructor.  */
class thumb1_const_print
{
 public:
  thumb1_const_print (FILE *f, int regno)
    : m_out (f), m_regname (reg_names[regno])
  {
  }

  /* Print "movs REG, #VAL".  */
  void mov (HOST_WIDE_INT val)
  {
    asm_fprintf (m_out, "\tmovs\t%s, #" HOST_WIDE_INT_PRINT_DEC "\n",
		 m_regname, val);
  }

  /* Print "adds REG, #VAL".  */
  void add (HOST_WIDE_INT val)
  {
    asm_fprintf (m_out, "\tadds\t%s, #" HOST_WIDE_INT_PRINT_DEC "\n",
		 m_regname, val);
  }

  /* Print "lsls REG, #SHIFT".  */
  void ashift (HOST_WIDE_INT shift)
  {
    asm_fprintf (m_out, "\tlsls\t%s, #" HOST_WIDE_INT_PRINT_DEC "\n",
		 m_regname, shift);
  }

  /* Print "rsbs REG, #0".  */
  void neg ()
  {
    asm_fprintf (m_out, "\trsbs\t%s, #0\n", m_regname);
  }

 private:
  /* Stream receiving the instructions.  */
  FILE *m_out;
  /* Name of the destination register, taken from reg_names.  */
  const char *m_regname;
};
/* Emit a sequence of movs/adds/lsls (plus a final rsbs for some negative
   values) that builds the 32-bit constant OP1, without emitting
   instructions for bytes that are zero.

   DST is the emitter object.  It must provide mov (val), add (val),
   ashift (shift) and neg (), each acting on the destination register.
   OP1 must be a canonical SImode value: the function asserts that it
   equals trunc_int_for_mode (op1, SImode).  */
template <class T>
void
thumb1_gen_const_int_1 (T dst, HOST_WIDE_INT op1)
{
  bool mov_done_p = false;
  /* Only the low 32 bits are significant.  OP1 is sign-extended in a
     HOST_WIDE_INT, so without this mask a negative constant has no
     leading zeroes, appears to need every byte of the host word, and
     gets its sign-extension bits emitted as 0xff bytes after the
     right-shift below (0x80000100 would take 6 instructions instead
     of 4).  The value built is the same either way.  */
  unsigned HOST_WIDE_INT val = op1 & 0xffffffff;
  int shift = 0;

  gcc_assert (op1 == trunc_int_for_mode (op1, SImode));

  /* A value that fits in 8 bits is a single movs.  */
  if (val <= 255)
    {
      dst.mov (val);
      return;
    }

  /* For negative numbers with the first nine bits set, build the
     opposite of OP1, then negate it, it's generally shorter and not
     longer.  */
  if ((val & 0xFF800000) == 0xFF800000)
    {
      thumb1_gen_const_int_1 (dst, -op1);
      dst.neg ();
      return;
    }

  /* In the general case, we need 7 instructions to build
     a 32 bits constant (1 movs, 3 lsls, 3 adds).  We can
     do better if VAL is small enough, or
     right-shiftable by a suitable amount.  If the
     right-shift enables to encode at least one less byte,
     it's worth it: we save an adds and a lsls at the
     expense of a final lsls.  */
  int final_shift = number_of_first_bit_set (val);

  int leading_zeroes = clz_hwi (val);
  /* Bytes needed to hold VAL as is...  */
  int number_of_bytes_needed
    = ((HOST_BITS_PER_WIDE_INT - 1 - leading_zeroes)
       / BITS_PER_UNIT) + 1;
  /* ... and once its trailing zero bits have been shifted out.  */
  int number_of_bytes_needed2
    = ((HOST_BITS_PER_WIDE_INT - 1 - leading_zeroes - final_shift)
       / BITS_PER_UNIT) + 1;

  if (number_of_bytes_needed2 < number_of_bytes_needed)
    val >>= final_shift;
  else
    final_shift = 0;

  /* If we are in a very small range, we can use either a single movs
     or movs+adds.  */
  if (val <= 510)
    {
      if (val > 255)
	{
	  unsigned HOST_WIDE_INT high = val - 255;

	  dst.mov (high);
	  dst.add (255);
	}
      else
	dst.mov (val);

      if (final_shift > 0)
	dst.ashift (final_shift);
    }
  else
    {
      /* General case, emit upper 3 bytes as needed.  */
      for (int i = 0; i < 3; i++)
	{
	  unsigned HOST_WIDE_INT byte = (val >> (8 * (3 - i))) & 0xff;

	  if (byte)
	    {
	      /* We are about to emit new bits, stop accumulating a
		 shift amount, and left-shift only if we have already
		 emitted some upper bits.  */
	      if (mov_done_p)
		{
		  dst.ashift (shift);
		  dst.add (byte);
		}
	      else
		dst.mov (byte);

	      /* Stop accumulating shift amount since we've just
		 emitted some bits.  */
	      shift = 0;

	      mov_done_p = true;
	    }

	  /* Each byte position passed after the first emitted byte adds
	     8 bits to the pending left-shift.  */
	  if (mov_done_p)
	    shift += 8;
	}

      /* Emit lower byte.  */
      if (!mov_done_p)
	dst.mov (val & 0xff);
      else
	{
	  dst.ashift (shift);
	  if (val & 0xff)
	    dst.add (val & 0xff);
	}

      /* Restore the trailing zero bits removed from VAL above.  */
      if (final_shift > 0)
	dst.ashift (final_shift);
    }
}
/* Entry point for thumb1.md: emit RTL that loads the constant OP1 into
   DST.  This wrapper exists because the thumb1_const_rtl emitter class
   is not exported.  */
void
thumb1_gen_const_int_rtl (rtx dst, HOST_WIDE_INT op1)
{
  thumb1_gen_const_int_1 (thumb1_const_rtl (dst), op1);
}
/* Output code to add DELTA to the first argument, and then jump
to FUNCTION. Used for C++ multiple inheritance. */

View File

@ -820,7 +820,7 @@
&& !satisfies_constraint_K (operands[1])"
[(clobber (const_int 0))]
"
thumb1_gen_const_int (operands[0], INTVAL (operands[1]));
thumb1_gen_const_int_rtl (operands[0], INTVAL (operands[1]));
DONE;
"
)

View File

@ -0,0 +1,175 @@
/* { dg-do compile } */
/* { dg-options "-mpure-code -mcpu=cortex-m0 -march=armv6s-m -mthumb" } */
/* { dg-final { check-function-bodies "**" "" } } */
/* Does not use thumb1_gen_const_int.
** test_0:
** ...
** movs r[0-3], #0
** ...
*/
int
test_0 ()
{
/* The expected body loads 0 with a single movs.  */
return 0;
}
/* Does not use thumb1_gen_const_int.
** test_128:
** ...
** movs r[0-3], #128
** ...
*/
int
test_128 ()
{
/* The expected body loads 128 with a single movs.  */
return 128;
}
/* Does not use thumb1_gen_const_int.
** test_264:
** ...
** movs r[0-3], #132
** lsls r[0-3], r[0-3], #1
** ...
*/
int
test_264 ()
{
/* 264 == 132 << 1, matching the expected movs/lsls pair.  */
return 264;
}
/* Does not use thumb1_gen_const_int.
** test_510:
** ...
** movs r[0-3], #255
** lsls r[0-3], r[0-3], #1
** ...
*/
int
test_510 ()
{
/* 510 == 255 << 1, matching the expected movs/lsls pair.  */
return 510;
}
/* Does not use thumb1_gen_const_int.
** test_512:
** ...
** movs r[0-3], #128
** lsls r[0-3], r[0-3], #2
** ...
*/
int
test_512 ()
{
/* 512 == 128 << 2, matching the expected movs/lsls pair.  */
return 512;
}
/* Does not use thumb1_gen_const_int.
** test_764:
** ...
** movs r[0-3], #191
** lsls r[0-3], r[0-3], #2
** ...
*/
int
test_764 ()
{
/* 764 == 191 << 2, matching the expected movs/lsls pair.  */
return 764;
}
/* Does not use thumb1_gen_const_int.
** test_65536:
** ...
** movs r[0-3], #128
** lsls r[0-3], r[0-3], #9
** ...
*/
int
test_65536 ()
{
/* 65536 == 128 << 9, matching the expected movs/lsls pair.  */
return 65536;
}
/*
** test_0x123456:
** ...
** movs r[0-3], #18
** lsls r[0-3], r[0-3], #8
** adds r[0-3], r[0-3], #52
** lsls r[0-3], r[0-3], #8
** adds r[0-3], r[0-3], #86
** ...
*/
int
test_0x123456 ()
{
/* Built byte by byte: 0x12 (18), 0x34 (52), 0x56 (86).  */
return 0x123456;
}
/*
** test_0x1123456:
** ...
** movs r[0-3], #137
** lsls r[0-3], r[0-3], #8
** adds r[0-3], r[0-3], #26
** lsls r[0-3], r[0-3], #8
** adds r[0-3], r[0-3], #43
** lsls r[0-3], r[0-3], #1
** ...
*/
int
test_0x1123456 ()
{
/* 0x1123456 == 0x891a2b << 1: bytes 137, 26, 43, then a final shift.  */
return 0x1123456;
}
/* With -Os, we generate:
movs r0, #16
lsls r0, r0, r0
With the other optimization levels, we generate:
movs r0, #16
lsls r0, r0, #16
hence the two alternatives. */
/*
** test_0x1000010:
** ...
** movs r[0-3], #16
** lsls r[0-3], r[0-3], (#16|r[0-3])
** adds r[0-3], r[0-3], #1
** lsls r[0-3], r[0-3], #4
** ...
*/
int
test_0x1000010 ()
{
/* 0x1000010 == ((16 << 16) + 1) << 4.  */
return 0x1000010;
}
/*
** test_0x1000011:
** ...
** movs r[0-3], #1
** lsls r[0-3], r[0-3], #24
** adds r[0-3], r[0-3], #17
** ...
*/
int
test_0x1000011 ()
{
/* 0x1000011 == (1 << 24) + 17.  */
return 0x1000011;
}
/*
** test_m8192:
** ...
** movs r[0-3], #1
** lsls r[0-3], r[0-3], #13
** rsbs r[0-3], r[0-3], #0
** ...
*/
int
test_m8192 ()
{
/* -8192 == -(1 << 13): the expected body builds 8192, then negates it.  */
return -8192;
}