mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-03-26 09:50:40 +08:00
S/390: z13 inline stpcpy implementation.
A handwritten loop for stpcpy using the new z13 vector instructions appears to be much faster than the millicoded instruction. However, the implementation is much longer and is therefore only enabled when optimizing for speed. gcc/testsuite/ChangeLog: * gcc.target/s390/md/movstr-2.c: New test. gcc/ChangeLog: * config/s390/s390-protos.h: Add s390_expand_vec_movstr prototype. * config/s390/s390.c (s390_expand_vec_movstr): New function. * config/s390/s390.md ("movstr<P:mode>"): Call s390_expand_vec_movstr. From-SVN: r233550
This commit is contained in:
parent
9a36359ec6
commit
859a4c0e84
@ -1,3 +1,10 @@
|
||||
2016-02-19 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
|
||||
|
||||
* config/s390/s390-protos.h: Add s390_expand_vec_movstr prototype.
|
||||
* config/s390/s390.c (s390_expand_vec_movstr): New function.
|
||||
* config/s390/s390.md ("movstr<P:mode>"): Call
|
||||
s390_expand_vec_movstr.
|
||||
|
||||
2016-02-19 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
|
||||
|
||||
* config/s390/s390.md: Add missing output modifier for operand 1
|
||||
|
@ -109,6 +109,7 @@ extern bool s390_expand_movmem (rtx, rtx, rtx);
|
||||
extern void s390_expand_setmem (rtx, rtx, rtx);
|
||||
extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx);
|
||||
extern void s390_expand_vec_strlen (rtx, rtx, rtx);
|
||||
extern void s390_expand_vec_movstr (rtx, rtx, rtx);
|
||||
extern bool s390_expand_addcc (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
|
||||
extern bool s390_expand_insv (rtx, rtx, rtx, rtx);
|
||||
extern void s390_expand_cs_hqi (machine_mode, rtx, rtx, rtx,
|
||||
|
@ -5622,6 +5622,124 @@ s390_expand_vec_strlen (rtx target, rtx string, rtx alignment)
|
||||
emit_move_insn (target, temp);
|
||||
}
|
||||
|
||||
void
|
||||
s390_expand_vec_movstr (rtx result, rtx dst, rtx src)
|
||||
{
|
||||
int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
|
||||
rtx temp = gen_reg_rtx (Pmode);
|
||||
rtx src_addr = XEXP (src, 0);
|
||||
rtx dst_addr = XEXP (dst, 0);
|
||||
rtx src_addr_reg = gen_reg_rtx (Pmode);
|
||||
rtx dst_addr_reg = gen_reg_rtx (Pmode);
|
||||
rtx offset = gen_reg_rtx (Pmode);
|
||||
rtx vsrc = gen_reg_rtx (V16QImode);
|
||||
rtx vpos = gen_reg_rtx (V16QImode);
|
||||
rtx loadlen = gen_reg_rtx (SImode);
|
||||
rtx gpos_qi = gen_reg_rtx(QImode);
|
||||
rtx gpos = gen_reg_rtx (SImode);
|
||||
rtx done_label = gen_label_rtx ();
|
||||
rtx loop_label = gen_label_rtx ();
|
||||
rtx exit_label = gen_label_rtx ();
|
||||
rtx full_label = gen_label_rtx ();
|
||||
|
||||
/* Perform a quick check for string ending on the first up to 16
|
||||
bytes and exit early if successful. */
|
||||
|
||||
emit_insn (gen_vlbb (vsrc, src, GEN_INT (6)));
|
||||
emit_insn (gen_lcbb (loadlen, src_addr, GEN_INT (6)));
|
||||
emit_insn (gen_vfenezv16qi (vpos, vsrc, vsrc));
|
||||
emit_insn (gen_vec_extractv16qi (gpos_qi, vpos, GEN_INT (7)));
|
||||
emit_move_insn (gpos, gen_rtx_SUBREG (SImode, gpos_qi, 0));
|
||||
/* gpos is the byte index if a zero was found and 16 otherwise.
|
||||
So if it is lower than the loaded bytes we have a hit. */
|
||||
emit_cmp_and_jump_insns (gpos, loadlen, GE, NULL_RTX, SImode, 1,
|
||||
full_label);
|
||||
emit_insn (gen_vstlv16qi (vsrc, gpos, dst));
|
||||
|
||||
force_expand_binop (Pmode, add_optab, dst_addr, gpos, result,
|
||||
1, OPTAB_DIRECT);
|
||||
emit_jump (exit_label);
|
||||
emit_barrier ();
|
||||
|
||||
emit_label (full_label);
|
||||
LABEL_NUSES (full_label) = 1;
|
||||
|
||||
/* Calculate `offset' so that src + offset points to the last byte
|
||||
before 16 byte alignment. */
|
||||
|
||||
/* temp = src_addr & 0xf */
|
||||
force_expand_binop (Pmode, and_optab, src_addr, GEN_INT (15), temp,
|
||||
1, OPTAB_DIRECT);
|
||||
|
||||
/* offset = 0xf - temp */
|
||||
emit_move_insn (offset, GEN_INT (15));
|
||||
force_expand_binop (Pmode, sub_optab, offset, temp, offset,
|
||||
1, OPTAB_DIRECT);
|
||||
|
||||
/* Store `offset' bytes in the dstination string. The quick check
|
||||
has loaded at least `offset' bytes into vsrc. */
|
||||
|
||||
emit_insn (gen_vstlv16qi (vsrc, gen_lowpart (SImode, offset), dst));
|
||||
|
||||
/* Advance to the next byte to be loaded. */
|
||||
force_expand_binop (Pmode, add_optab, offset, const1_rtx, offset,
|
||||
1, OPTAB_DIRECT);
|
||||
|
||||
/* Make sure the addresses are single regs which can be used as a
|
||||
base. */
|
||||
emit_move_insn (src_addr_reg, src_addr);
|
||||
emit_move_insn (dst_addr_reg, dst_addr);
|
||||
|
||||
/* MAIN LOOP */
|
||||
|
||||
emit_label (loop_label);
|
||||
LABEL_NUSES (loop_label) = 1;
|
||||
|
||||
emit_move_insn (vsrc,
|
||||
gen_rtx_MEM (V16QImode,
|
||||
gen_rtx_PLUS (Pmode, src_addr_reg, offset)));
|
||||
|
||||
emit_insn (gen_vec_vfenesv16qi (vpos, vsrc, vsrc,
|
||||
GEN_INT (VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
|
||||
add_int_reg_note (s390_emit_ccraw_jump (8, EQ, done_label),
|
||||
REG_BR_PROB, very_unlikely);
|
||||
|
||||
emit_move_insn (gen_rtx_MEM (V16QImode,
|
||||
gen_rtx_PLUS (Pmode, dst_addr_reg, offset)),
|
||||
vsrc);
|
||||
/* offset += 16 */
|
||||
force_expand_binop (Pmode, add_optab, offset, GEN_INT (16),
|
||||
offset, 1, OPTAB_DIRECT);
|
||||
|
||||
emit_jump (loop_label);
|
||||
emit_barrier ();
|
||||
|
||||
/* REGULAR EXIT */
|
||||
|
||||
/* We are done. Add the offset of the zero character to the dst_addr
|
||||
pointer to get the result. */
|
||||
|
||||
emit_label (done_label);
|
||||
LABEL_NUSES (done_label) = 1;
|
||||
|
||||
force_expand_binop (Pmode, add_optab, dst_addr_reg, offset, dst_addr_reg,
|
||||
1, OPTAB_DIRECT);
|
||||
|
||||
emit_insn (gen_vec_extractv16qi (gpos_qi, vpos, GEN_INT (7)));
|
||||
emit_move_insn (gpos, gen_rtx_SUBREG (SImode, gpos_qi, 0));
|
||||
|
||||
emit_insn (gen_vstlv16qi (vsrc, gpos, gen_rtx_MEM (BLKmode, dst_addr_reg)));
|
||||
|
||||
force_expand_binop (Pmode, add_optab, dst_addr_reg, gpos, result,
|
||||
1, OPTAB_DIRECT);
|
||||
|
||||
/* EARLY EXIT */
|
||||
|
||||
emit_label (exit_label);
|
||||
LABEL_NUSES (exit_label) = 1;
|
||||
}
|
||||
|
||||
|
||||
/* Expand conditional increment or decrement using alc/slb instructions.
|
||||
Should generate code setting DST to either SRC or SRC + INCREMENT,
|
||||
depending on the result of the comparison CMP_OP0 CMP_CODE CMP_OP1.
|
||||
|
@ -2953,8 +2953,16 @@
|
||||
(clobber (reg:CC CC_REGNUM))])]
|
||||
""
|
||||
{
|
||||
rtx addr1 = gen_reg_rtx (Pmode);
|
||||
rtx addr2 = gen_reg_rtx (Pmode);
|
||||
rtx addr1, addr2;
|
||||
|
||||
if (TARGET_VX && optimize_function_for_speed_p (cfun))
|
||||
{
|
||||
s390_expand_vec_movstr (operands[0], operands[1], operands[2]);
|
||||
DONE;
|
||||
}
|
||||
|
||||
addr1 = gen_reg_rtx (Pmode);
|
||||
addr2 = gen_reg_rtx (Pmode);
|
||||
|
||||
emit_move_insn (addr1, force_operand (XEXP (operands[1], 0), NULL_RTX));
|
||||
emit_move_insn (addr2, force_operand (XEXP (operands[2], 0), NULL_RTX));
|
||||
|
@ -1,3 +1,7 @@
|
||||
2016-02-19 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
|
||||
|
||||
* gcc.target/s390/md/movstr-2.c: New test.
|
||||
|
||||
2016-02-19 Marcin Kościelnicki <koriakin@0x04.net>
|
||||
|
||||
* gcc.target/s390/morestack.c: New test.
|
||||
|
98
gcc/testsuite/gcc.target/s390/md/movstr-2.c
Normal file
98
gcc/testsuite/gcc.target/s390/md/movstr-2.c
Normal file
@ -0,0 +1,98 @@
|
||||
/* The z13 stpcpy implementation plays some alignment tricks for good
|
||||
performance. This test tries to make sure it works correctly and
|
||||
does not access bytes beyond the source and destination
|
||||
strings. */
|
||||
|
||||
/* { dg-do run } */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#define PAGE_SIZE 4096

/* Source (s) and destination (d) buffers for the test.  Each object
   spans two pages: the named members m32 ... m1 occupy the last 32
   bytes of the first page, and next_page is the complete second page.

   The aligned attribute is attached to the struct type so that BOTH
   objects start on a page boundary.  Written after the last
   declarator (`s, d __attribute__ ((aligned ...))') it would apply to
   `d' only, leaving the placement of `s' to chance.  */
struct {
  char unused[PAGE_SIZE - 32];
  char m32[15]; /* page bndry - 32 */
  char m17[1];
  char m16[1];
  char m15[14];
  char m1[1];   /* last byte of the first page */
  char next_page[PAGE_SIZE];
} __attribute__((aligned(PAGE_SIZE))) s, d;
|
||||
|
||||
/* Non-inlined wrapper around __builtin_stpcpy.  Copies SRC to DEST
   and hands back whatever the builtin returns.  */
char *__attribute__((noinline))
my_stpcpy(char *dest, const char *src)
{
  char *end = __builtin_stpcpy (dest, src);

  return end;
}
|
||||
|
||||
/* Copy SRC to DEST with my_stpcpy and abort unless the returned
   pointer equals DEST + LEN and the first LEN bytes of DEST are
   identical to those of SRC.  */
void __attribute__ ((noinline))
check (char *dest, char *src, size_t len)
{
  char *end = my_stpcpy (dest, src);

  if (end != dest + len
      || __builtin_memcmp (src, dest, len) != 0)
    __builtin_abort ();
}
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
char *src[5] = { s.m32, s.m17, s.m16, s.m15, s.m1 };
|
||||
char *dst[5] = { d.m32, d.m17, d.m16, d.m15, d.m1 };
|
||||
int len[8] = { 33, 32, 31, 17, 16, 15, 1, 0 };
|
||||
int i, j, k;
|
||||
char backup;
|
||||
|
||||
for (i = 0; i < sizeof (s); i++)
|
||||
((char*)&s)[i] = i % 26 + 97;
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
for (j = 0; j < 5; j++)
|
||||
for (k = 0; k < 8; k++)
|
||||
{
|
||||
backup = src[j][len[k]];
|
||||
src[j][len[k]] = 0;
|
||||
__builtin_memset (&d, 0, sizeof (d));
|
||||
check (dst[i], src[j], len[k]);
|
||||
src[j][len[k]] = backup;
|
||||
}
|
||||
|
||||
/* Make all source strings end before the page boundary. */
|
||||
backup = s.m1[0];
|
||||
s.m1[0] = 0;
|
||||
|
||||
if (mprotect (&s.next_page, PAGE_SIZE, PROT_NONE) == -1)
|
||||
perror ("mprotect src");
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
for (j = 0; j < 5; j++)
|
||||
check (dst[i], src[j],
|
||||
PAGE_SIZE - ((unsigned long)src[j] & ((1UL << 12) - 1)) - 1);
|
||||
|
||||
if (mprotect (&s.next_page, PAGE_SIZE, PROT_READ | PROT_WRITE) == -1)
|
||||
perror ("mprotect src");
|
||||
|
||||
s.m1[0] = backup;
|
||||
|
||||
if (mprotect (&d.next_page, PAGE_SIZE, PROT_NONE) == -1)
|
||||
perror ("mprotect dst");
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
for (j = 0; j < 5; j++)
|
||||
{
|
||||
int len = PAGE_SIZE - ((unsigned long)dst[i] & ((1UL << 12) - 1)) - 1;
|
||||
char backup = src[j][len];
|
||||
|
||||
src[j][len] = 0;
|
||||
__builtin_memset (&d, 0,
|
||||
(unsigned long)&d.next_page - (unsigned long)&d);
|
||||
check (dst[i], src[j], len);
|
||||
src[j][len] = backup;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user