From 24883a4a12adbcd0f1d9c876dff34cab9038027d Mon Sep 17 00:00:00 2001 From: Jeffrey A Law Date: Mon, 8 Mar 1999 23:31:28 +0000 Subject: [PATCH] i386.md (ashlsi3): Revise comments. * i386.md (ashlsi3): Revise comments. Provide new anonymous pattern for Pentium and PPro/PII. Reverse constraints in generic ashlsi3 anonymous pattern. From-SVN: r25647 --- gcc/ChangeLog | 4 + gcc/config/i386/i386.md | 166 +++++++++++++++++++++++++++++++++++----- 2 files changed, 152 insertions(+), 18 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d70c60fe80db..e61d29362ba0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -12,6 +12,10 @@ Mon Mar 8 16:04:44 1999 Jim Wilson Mon Mar 8 15:27:42 1999 Jeffrey A Law (law@cygnus.com) + * i386.md (ashlsi3): Revise comments. Provide new anonymous + pattern for Pentium and PPro/PII. Reverse constraints in + generic ashlsi3 anonymous pattern. + * calls.c (initialize_argument_info): Accept a pointer to CUMULATIVE_ARGS. (expand_call): Pass the address of CUMULATIVE_ARGS. diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c68d5f8d099a..8ae917c1801b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4709,10 +4709,6 @@ byte_xor_operation: RET; }") -;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg" -;; On i486, movl/sall appears slightly faster than leal, but the leal -;; is smaller - use leal for now unless the shift count is 1. - (define_expand "ashlsi3" [(set (match_operand:SI 0 "nonimmediate_operand" "") (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "") @@ -4720,19 +4716,20 @@ byte_xor_operation: "" "") -;; For register destinations: -;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes +;; Optimizing for code size: +;; For register destinations: +;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes ;; -;; lea loses when optimizing for size +;; lea loses when optimizing for size ;; -;; Do the math. 
If the count is 1, using add, else using sal will -;; produce the smallest possible code, even when the source and -;; dest do not match. For a memory destination, sal is the only -;; choice. +;; Do the math. If the count is 1, using add, else using sal will +;; produce the smallest possible code, even when the source and +;; dest do not match. For a memory destination, sal is the only +;; choice. ;; -;; Do not try to handle case where src and dest do not match. Let regmove -;; and reload handle them. A mov followed by this insn will generate the -;; desired size optimized results. +;; Do not try to handle case where src and dest do not match. Let regmove +;; and reload handle them. A mov followed by this insn will generate the +;; desired size optimized results. (define_insn "" [(set (match_operand:SI 0 "nonimmediate_operand" "=rm") (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0") @@ -4748,11 +4745,144 @@ byte_xor_operation: return AS2 (sal%L0,%2,%0); }") +;; For Pentium/Pentium MMX: +;; +;; We want to optimize for pairability, but avoid generating AGI stalls. +;; +;; If this insn is expected to issue in the U pipe, then prefer sal, +;; else prefer lea for small shifts when srcreg == dstreg. +;; +;; For PPro/PII +;; +;; There's more than one approach to optimizing for this family; it is +;; unclear which approach is best. For now, we will try to minimize +;; uops. Note that sal and lea have the same characteristics, so we +;; prefer sal as it takes less space. +;; +;; We can actually share code for these two cases since the basic techniques +;; for generating good code on these chips are the same, even if the final +;; code sequences are different. +;; +;; I do not know what is most appropriate for the AMD or Cyrix chips. +;; +;; srcreg == dstreg, constant shift count: +;; +;; For a shift count of one, use "add". +;; For a shift count of two or three, use "sal"/"lea" for Pentium and +;; Pentium MMX depending on which pipe the insn will execute. 
+;; All others use "sal". +;; +;; srcreg != dstreg, constant shift count: +;; +;; For shift counts of one to three, use "lea". +;; All others use "lea" for the first shift into the destination reg, +;; then fall back on the srcreg == dstreg case for the residual shifts. +;; +;; memory destinations or nonconstant shift count: +;; +;; Use "sal". +;; (define_insn "" - [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm") - (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "r,0") - (match_operand:SI 2 "nonmemory_operand" "M,cI")))] - "! optimize_size" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r") + (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r") + (match_operand:SI 2 "nonmemory_operand" "cI,I")))] + "! optimize_size + && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM + || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)" + "* +{ + /* This should be extremely rare (impossible?). We can not encode a shift + of the stack pointer using an lea instruction. So copy the stack pointer + into the destination register and fall into the srcreg == dstreg shifting + support. */ + if (operands[1] == stack_pointer_rtx) + { + output_asm_insn (AS2 (mov%L0,%1,%0), operands); + operands[1] = operands[0]; + } + + /* Handle case where srcreg != dstreg. */ + if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1])) + { + /* For counts > 3, it is easiest to split into component insns. */ + if (INTVAL (operands[2]) > 3) + return \"#\"; + + /* For shifts up to and including 3 bits, use lea. */ + operands[1] = gen_rtx_MULT (SImode, operands[1], + GEN_INT (1 << INTVAL (operands[2]))); + return AS2 (lea%L0,%a1,%0); + } + + /* Source and destination match. */ + + /* Handle variable shift. */ + if (REG_P (operands[2])) + return AS2 (sal%L0,%b2,%0); + + /* Always perform shift by 1 using an add instruction. */ + if (REG_P (operands[0]) && operands[2] == const1_rtx) + return AS2 (add%L0,%0,%0); + +#if 0 + /* ??? Currently disabled. 
reg-stack currently stomps on the mode of + each insn. Thus, we can not easily detect when we should use lea to + improve issue characteristics. Until reg-stack is fixed, fall back to + sal instruction for Pentiums to avoid AGI stall. */ + /* Shift reg by 2 or 3 use an lea instruction for Pentium if this + insn is expected to issue into the V pipe (the insn's mode will be + TImode for a U pipe, and !TImode for a V pipe instruction). */ + if (REG_P (operands[0]) + && GET_CODE (operands[2]) == CONST_INT + && INTVAL (operands[2]) <= 3 + && (int)ix86_cpu == (int)PROCESSOR_PENTIUM + && GET_MODE (insn) != TImode) + { + operands[1] = gen_rtx_MULT (SImode, operands[1], + GEN_INT (1 << INTVAL (operands[2]))); + return AS2 (lea%L0,%a1,%0); + } +#endif + + /* Otherwise use a shift instruction. */ + return AS2 (sal%L0,%2,%0); +}") + +;; Pentium/PPro/PII Splitter used when srcreg != destreg and shift +;; count is > 3. In each case we use lea to perform the first three +;; shifts into the destination register, then we fall back to the +;; normal shifting code for the residual shifts. +(define_split + [(set (match_operand:SI 0 "register_operand" "=r") + (ashift:SI (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "immediate_operand" "I")))] + "reload_completed + && ! optimize_size + && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM + || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO) + && GET_CODE (operands[2]) == CONST_INT + && INTVAL (operands[2]) > 3 + && true_regnum (operands[0]) != true_regnum (operands[1])" + [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 3)))] + " +{ + operands[3] = GEN_INT (INTVAL (operands[2]) - 3); + operands[2] = GEN_INT (3); +}") + + +;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg" +;; On i486, movl/sall appears slightly faster than leal, but the leal +;; is smaller - use leal for now unless the shift count is 1. 
+;; +(define_insn "" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r") + (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r") + (match_operand:SI 2 "nonmemory_operand" "cI,M")))] + "! optimize_size + && ! ((int)ix86_cpu == (int)PROCESSOR_PENTIUM + || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)" "* { if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))