mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-04 20:01:21 +08:00
i386.md (ashlsi3): Revise comments.
* i386.md (ashlsi3): Revise comments. Provide new anonymous pattern for Pentium and PPro/PII. Reverse constraints in generic ashlsi3 anonymous pattern. From-SVN: r25647
This commit is contained in:
parent
959f3a0667
commit
24883a4a12
@ -12,6 +12,10 @@ Mon Mar 8 16:04:44 1999 Jim Wilson <wilson@cygnus.com>
|
||||
|
||||
Mon Mar 8 15:27:42 1999 Jeffrey A Law (law@cygnus.com)
|
||||
|
||||
* i386.md (ashlsi3): Revise comments. Provide new anonymous
|
||||
pattern for Pentium and PPro/PII. Reverse constraints in
|
||||
generic ashlsi3 anonymous pattern.
|
||||
|
||||
* calls.c (initialize_argument_info): Accept a pointer to
|
||||
CUMULATIVE_ARGS.
|
||||
(expand_call): Pass the address of CUMULATIVE_ARGS.
|
||||
|
@ -4709,10 +4709,6 @@ byte_xor_operation:
|
||||
RET;
|
||||
}")
|
||||
|
||||
;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
|
||||
;; On i486, movl/sall appears slightly faster than leal, but the leal
|
||||
;; is smaller - use leal for now unless the shift count is 1.
|
||||
|
||||
(define_expand "ashlsi3"
|
||||
[(set (match_operand:SI 0 "nonimmediate_operand" "")
|
||||
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
|
||||
@ -4720,19 +4716,20 @@ byte_xor_operation:
|
||||
""
|
||||
"")
|
||||
|
||||
;; For register destinations:
|
||||
;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
|
||||
;; Optimizing for code size:
|
||||
;; For register destinations:
|
||||
;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
|
||||
;;
|
||||
;; lea loses when optimizing for size
|
||||
;; lea loses when optimizing for size
|
||||
;;
|
||||
;; Do the math. If the count is 1, using add, else using sal will
|
||||
;; produce the smallest possible code, even when the source and
|
||||
;; dest do not match. For a memory destination, sal is the only
|
||||
;; choice.
|
||||
;; Do the math. If the count is 1, using add, else using sal will
|
||||
;; produce the smallest possible code, even when the source and
|
||||
;; dest do not match. For a memory destination, sal is the only
|
||||
;; choice.
|
||||
;;
|
||||
;; Do not try to handle case where src and dest do not match. Let regmove
|
||||
;; and reload handle them. A mov followed by this insn will generate the
|
||||
;; desired size optimized results.
|
||||
;; Do not try to handle case where src and dest do not match. Let regmove
|
||||
;; and reload handle them. A mov followed by this insn will generate the
|
||||
;; desired size optimized results.
|
||||
(define_insn ""
|
||||
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
|
||||
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
|
||||
@ -4748,11 +4745,144 @@ byte_xor_operation:
|
||||
return AS2 (sal%L0,%2,%0);
|
||||
}")
|
||||
|
||||
;; For Pentium/Pentium MMX:
|
||||
;;
|
||||
;; We want to optimize for pairability, but avoid generating AGI stalls.
|
||||
;;
|
||||
;; If this insn is expected to issue in the U pipe, then prefer sal,
|
||||
;; else prefer lea for small shifts when srcreg == dstreg.
|
||||
;;
|
||||
;; For PPro/PII
|
||||
;;
|
||||
;; There's more than one approach to optimizing for this family; it is
|
||||
;; unclear which approach is best. For now, we will try to minimize
|
||||
;; uops. Note that sal and lea have the same characteristics, so we
|
||||
;; prefer sal as it takes less space.
|
||||
;;
|
||||
;; We can actually share code for these two cases since the basic techniques
|
||||
;; for generating good code on these chips are the same, even if the final
|
||||
;; code sequences are different.
|
||||
;;
|
||||
;; I do not know what is most appropriate for the AMD or Cyrix chips.
|
||||
;;
|
||||
;; srcreg == dstreg, constant shift count:
|
||||
;;
|
||||
;; For a shift count of one, use "add".
|
||||
;; For a shift count of two or three, use "sal"/"lea" for Pentium and
|
||||
;; Pentium MMX depending on which pipe the insn will execute.
|
||||
;; All others use "sal".
|
||||
;;
|
||||
;; srcreg != dstreg, constant shift count:
|
||||
;;
|
||||
;; For shift counts of one to three, use "lea".
|
||||
;; All others use "lea" for the first shift into the destination reg,
|
||||
;; then fall back on the srcreg == dstreg for the residual shifts.
|
||||
;;
|
||||
;; memory destinations or nonconstant shift count:
|
||||
;;
|
||||
;; Use "sal".
|
||||
;;
|
||||
(define_insn ""
|
||||
[(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm")
|
||||
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "r,0")
|
||||
(match_operand:SI 2 "nonmemory_operand" "M,cI")))]
|
||||
"! optimize_size"
|
||||
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
|
||||
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
|
||||
(match_operand:SI 2 "nonmemory_operand" "cI,I")))]
|
||||
"! optimize_size
|
||||
&& ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
|
||||
|| (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
|
||||
"*
|
||||
{
|
||||
/* This should be extremely rare (impossible?). We can not encode a shift
|
||||
of the stack pointer using an lea instruction. So copy the stack pointer
|
||||
into the destination register and fall into the srcreg == dstreg shifting
|
||||
support. */
|
||||
if (operands[1] == stack_pointer_rtx)
|
||||
{
|
||||
output_asm_insn (AS2 (mov%L0,%1,%0), operands);
|
||||
operands[1] = operands[0];
|
||||
}
|
||||
|
||||
/* Handle case where srcreg != dstreg. */
|
||||
if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
|
||||
{
|
||||
/* For counts > 3, it is easiest to split into component insns. */
|
||||
if (INTVAL (operands[2]) > 3)
|
||||
return \"#\";
|
||||
|
||||
/* For shifts up to and including 3 bits, use lea. */
|
||||
operands[1] = gen_rtx_MULT (SImode, operands[1],
|
||||
GEN_INT (1 << INTVAL (operands[2])));
|
||||
return AS2 (lea%L0,%a1,%0);
|
||||
}
|
||||
|
||||
/* Source and destination match. */
|
||||
|
||||
/* Handle variable shift. */
|
||||
if (REG_P (operands[2]))
|
||||
return AS2 (sal%L0,%b2,%0);
|
||||
|
||||
/* Always perform shift by 1 using an add instruction. */
|
||||
if (REG_P (operands[0]) && operands[2] == const1_rtx)
|
||||
return AS2 (add%L0,%0,%0);
|
||||
|
||||
#if 0
|
||||
/* ??? Currently disabled. reg-stack currently stomps on the mode of
|
||||
each insn. Thus, we can not easily detect when we should use lea to
|
||||
improve issue characteristics. Until reg-stack is fixed, fall back to
|
||||
sal instruction for Pentiums to avoid AGI stall. */
|
||||
/* Shift reg by 2 or 3 use an lea instruction for Pentium if this is
|
||||
insn is expected to issue into the V pipe (the insn's mode will be
|
||||
TImode for a U pipe, and !TImode for a V pipe instruction). */
|
||||
if (REG_P (operands[0])
|
||||
&& GET_CODE (operands[2]) == CONST_INT
|
||||
&& INTVAL (operands[2]) <= 3
|
||||
&& (int)ix86_cpu == (int)PROCESSOR_PENTIUM
|
||||
&& GET_MODE (insn) != TImode)
|
||||
{
|
||||
operands[1] = gen_rtx_MULT (SImode, operands[1],
|
||||
GEN_INT (1 << INTVAL (operands[2])));
|
||||
return AS2 (lea%L0,%a1,%0);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Otherwise use a shift instruction. */
|
||||
return AS2 (sal%L0,%2,%0);
|
||||
}")
|
||||
|
||||
;; Pentium/PPro/PII splitter used when srcreg != destreg and the constant
;; shift count is > 3.  The shift is split into two insns:
;;
;;   1. dest = src << 3    (srcreg != dstreg, count <= 3: emitted as lea)
;;   2. dest = dest << N-3 (srcreg == dstreg: normal add/sal shifting code)
;;
;; i.e. lea performs a shift by three into the destination register, then
;; we fall back to the normal shifting code for the residual count.
(define_split
  [(set (match_operand:SI 0 "register_operand" "=r")
	(ashift:SI (match_operand:SI 1 "register_operand" "r")
		   (match_operand:SI 2 "immediate_operand" "I")))]
  ;; Only split after reload, when the hard registers are known, and only
  ;; for the CPUs and shift counts for which the insn pattern emits '#'.
  "reload_completed
   && ! optimize_size
   && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
       || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)
   && GET_CODE (operands[2]) == CONST_INT
   && INTVAL (operands[2]) > 3
   && true_regnum (operands[0]) != true_regnum (operands[1])"
  [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2)))
   (set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 3)))]
  "
{
  /* Residual count for the second shift.  Extract the integer value of
     the original count first and only then subtract 3; writing
     INTVAL (operands[2] - 3) would do pointer arithmetic on the rtx and
     read the value from an unrelated address.  This must be computed
     before operands[2] is overwritten below.  */
  operands[3] = GEN_INT (INTVAL (operands[2]) - 3);

  /* The first shift, done with lea, is always by 3.  */
  operands[2] = GEN_INT (3);
}")
|
||||
|
||||
|
||||
;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
|
||||
;; On i486, movl/sall appears slightly faster than leal, but the leal
|
||||
;; is smaller - use leal for now unless the shift count is 1.
|
||||
;;
|
||||
(define_insn ""
|
||||
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
|
||||
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
|
||||
(match_operand:SI 2 "nonmemory_operand" "cI,M")))]
|
||||
"! optimize_size
|
||||
&& ! ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
|
||||
|| (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
|
||||
"*
|
||||
{
|
||||
if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
|
||||
|
Loading…
x
Reference in New Issue
Block a user