From 24883a4a12adbcd0f1d9c876dff34cab9038027d Mon Sep 17 00:00:00 2001
From: Jeffrey A Law <law@cygnus.com>
Date: Mon, 8 Mar 1999 23:31:28 +0000
Subject: [PATCH] i386.md (ashlsi3): Revise comments.

        * i386.md (ashlsi3): Revise comments.  Provide new anonymous
        pattern for Pentium and PPro/PII.  Reverse constraints in
        generic ashlsi3 anonymous pattern.

From-SVN: r25647
---
 gcc/ChangeLog           |   4 +
 gcc/config/i386/i386.md | 166 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 152 insertions(+), 18 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d70c60fe80db..e61d29362ba0 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -12,6 +12,10 @@ Mon Mar  8 16:04:44 1999  Jim Wilson  <wilson@cygnus.com>
 
 Mon Mar  8 15:27:42 1999  Jeffrey A Law  (law@cygnus.com)
 
+	* i386.md (ashlsi3): Revise comments.  Provide new anonymous
+	pattern for Pentium and PPro/PII.  Reverse constraints in 
+	generic ashlsi3 anonymous pattern.
+
 	* calls.c (initialize_argument_info): Accept a pointer to 
 	CUMULATIVE_ARGS.
 	(expand_call): Pass the address of CUMULATIVE_ARGS.
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c68d5f8d099a..8ae917c1801b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4709,10 +4709,6 @@ byte_xor_operation:
   RET;
 }")
 
-;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
-;; On i486, movl/sall appears slightly faster than leal, but the leal
-;; is smaller - use leal for now unless the shift count is 1.
-
 (define_expand "ashlsi3"
   [(set (match_operand:SI 0 "nonimmediate_operand" "")
 	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
@@ -4720,19 +4716,20 @@ byte_xor_operation:
   ""
   "")
 
-;; For register destinations:
-;;   add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
+;; Optimizing for code size:
+;;   For register destinations:
+;;     add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
 ;;
-;;   lea loses when optimizing for size
+;;     lea loses when optimizing for size
 ;;
-;; Do the math.  If the count is 1, using add, else using sal will
-;; produce the smallest possible code, even when the source and
-;; dest do not match.  For a memory destination, sal is the only
-;; choice.
+;;   Do the math.  If the count is 1, using add, else using sal will
+;;   produce the smallest possible code, even when the source and
+;;   dest do not match.  For a memory destination, sal is the only
+;;   choice.
 ;;
-;; Do not try to handle case where src and dest do not match.  Let regmove
-;; and reload handle them.  A mov followed by this insn will generate the
-;; desired size optimized results.
+;;   Do not try to handle case where src and dest do not match.  Let regmove
+;;   and reload handle them.  A mov followed by this insn will generate the
+;;   desired size optimized results.
 (define_insn ""
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
 	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
@@ -4748,11 +4745,144 @@ byte_xor_operation:
   return AS2 (sal%L0,%2,%0);
 }")
 
+;; For Pentium/Pentium MMX:
+;;
+;;   We want to optimize for pairability, but avoid generating AGI stalls.
+;;
+;;   If this insn is expected to issue in the U pipe, then prefer sal,
+;;   else prefer lea for small shifts when srcreg == dstreg.
+;;
+;; For PPro/PII
+;;
+;;   There's more than one approach to optimizing for this family; it is
+;;   unclear which approach is best.  For now, we will try to minimize
+;;   uops.  Note that sal and lea have the same characteristics, so we
+;;   prefer sal as it takes less space.
+;;
+;; We can actually share code for these two cases since the basic techniques
+;; for generating good code on these chips are the same, even if the final
+;; code sequences are different.
+;;
+;; I do not know what is most appropriate for the AMD or Cyrix chips.
+;;
+;;   srcreg == dstreg, constant shift count:
+;;
+;;     For a shift count of one, use "add".
+;;     For a shift count of two or three, use "sal"/"lea" for Pentium and
+;;     Pentium MMX depending on which pipe the insn will execute.
+;;     All others use "sal".
+;;
+;;   srcreg != dstreg, constant shift count:
+;;
+;;     For shift counts of one to three, use "lea".
+;;     All others use "lea" for the first shift into the destination reg,
+;;     then fall back on the srcreg == dstreg for the residual shifts.
+;;
+;;   memory destinations or nonconstant shift count:
+;;
+;;     Use "sal".
+;;
 (define_insn ""
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm")
-	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "r,0")
-		   (match_operand:SI 2 "nonmemory_operand" "M,cI")))]
-  "! optimize_size"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
+	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
+		   (match_operand:SI 2 "nonmemory_operand" "cI,I")))]
+  "! optimize_size
+   && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+       || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
+  "*
+{
+  /* This should be extremely rare (impossible?).  We can not encode a shift
+     of the stack pointer using an lea instruction.  So copy the stack pointer
+     into the destination register and fall into the srcreg == dstreg shifting
+     support.  */
+  if (operands[1] == stack_pointer_rtx)
+    {
+      output_asm_insn (AS2 (mov%L0,%1,%0), operands);
+      operands[1] = operands[0];
+    }
+
+  /* Handle case where srcreg != dstreg.  */
+  if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
+    {
+      /* For counts > 3, it is easiest to split into component insns.  */
+      if (INTVAL (operands[2]) > 3)
+	return \"#\";
+    
+      /* For shifts up to and including 3 bits, use lea.  */
+      operands[1] = gen_rtx_MULT (SImode, operands[1],
+				  GEN_INT (1 << INTVAL (operands[2])));
+      return AS2 (lea%L0,%a1,%0);
+    }
+
+  /* Source and destination match.  */
+
+  /* Handle variable shift.  */
+  if (REG_P (operands[2]))
+    return AS2 (sal%L0,%b2,%0);
+
+  /* Always perform shift by 1 using an add instruction.  */
+  if (REG_P (operands[0]) && operands[2] == const1_rtx)
+    return AS2 (add%L0,%0,%0);
+
+#if 0
+  /* ??? Currently disabled.  reg-stack currently stomps on the mode of
+     each insn.  Thus, we can not easily detect when we should use lea to
+     improve issue characteristics.  Until reg-stack is fixed, fall back to
+     sal instruction for Pentiums to avoid AGI stall.  */
+  /* When shifting a register by 2 or 3, use an lea instruction on Pentium
+     if this insn is expected to issue into the V pipe (the insn's mode will
+     be TImode for a U pipe, and !TImode for a V pipe instruction).  */
+  if (REG_P (operands[0])
+      && GET_CODE (operands[2]) == CONST_INT
+      && INTVAL (operands[2]) <= 3
+      && (int)ix86_cpu == (int)PROCESSOR_PENTIUM
+      && GET_MODE (insn) != TImode)
+    {
+      operands[1] = gen_rtx_MULT (SImode, operands[1],
+				  GEN_INT (1 << INTVAL (operands[2])));
+      return AS2 (lea%L0,%a1,%0);
+    }
+#endif
+
+  /* Otherwise use a shift instruction.  */
+  return AS2 (sal%L0,%2,%0);
+}")
+
+;; Pentium/PPro/PII splitter used when srcreg != destreg and the constant
+;; shift count is > 3.  The first insn shifts by 3 into the destination
+;; register (a count the insn pattern above emits as lea); the second
+;; shifts the destination in place by the residual count, operands[2] - 3.
+(define_split
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ashift:SI (match_operand:SI 1 "register_operand" "r")
+		   (match_operand:SI 2 "immediate_operand" "I")))]
+  "reload_completed
+   && ! optimize_size
+   && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+       || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)
+   && GET_CODE (operands[2]) == CONST_INT
+   && INTVAL (operands[2]) > 3
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 3)))]
+  "
+{
+  operands[3] = GEN_INT (INTVAL (operands[2]) - 3);
+  operands[2] = GEN_INT (3);
+}")
+
+
+;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
+;; On i486, movl/sall appears slightly faster than leal, but the leal
+;; is smaller - use leal for now unless the shift count is 1.
+;;
+(define_insn ""
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
+	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
+		   (match_operand:SI 2 "nonmemory_operand" "cI,M")))]
+  "! optimize_size
+   && ! ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+         || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
   "*
 {
   if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))