genopinit.c (vec_shl_optab, [...]): Initialize new optabs.

* genopinit.c (vec_shl_optab, vec_shr_optab): Initialize new optabs. (reduc_plus_optab): Removed. Replaced with... (reduc_splus_optab, reduc_uplus_optab): Initialize new optabs. * optabs.c (optab_for_tree_code): Return reduc_splus_optab or reduc_uplus_optab instead of reduc_plus_optab. (expand_vec_shift_expr): New function. (init_optabs): Initialize new optabs. Remove initialization of reduc_plus_optab. (optab_for_tree_code): Return vec_shl_optab/vec_shr_optab for VEC_LSHIFT_EXPR/VEC_RSHIFT_EXPR. * optabs.h (OTI_reduc_plus): Removed. Replaced with... (OTI_reduc_splus, OTI_reduc_uplus): New. (reduc_plus_optab): Removed. Replaced with... (reduc_splus_optab, reduc_uplus_optab): New optabs. (vec_shl_optab, vec_shr_optab): New optabs. (expand_vec_shift_expr): New function declaration. * tree.def (VEC_LSHIFT_EXPR, VEC_RSHIFT_EXPR): New tree-codes. * tree-inline.c (estimate_num_insns_1): Handle new tree-codes. * expr.c (expand_expr_real_1): Handle new tree-codes. * tree-pretty-print.c (dump_generic_node, op_symbol, op_prio): Likewise. * tree-vect-generic.c (expand_vector_operations_1): Add assert. * tree-vect-transform.c (vect_create_epilog_for_reduction): Add two alternatives for generating reduction epilog code. (vectorizable_reduction): Don't fail if direct reduction support is not available. (vectorizable_target_reduction_pattern): Likewise. * config/rs6000/altivec.md (reduc_smax_v4si, reduc_smax_v4sf, reduc_umax_v4si, reduc_smin_v4si, reduc_smin_v4sf, reduc_umin_v4si, reduc_plus_v4si, reduc_plus_v4sf): Removed. (vec_shl_<mode>, vec_shr_<mode>, altivec_vsumsws_nomode, reduc_splus_<mode>, reduc_uplus_v16qi): New. From-SVN: r101231
2025-02-23 06:49:11 +08:00 · 2005-06-21 09:02:00 +00:00 · 2005-06-21 09:02:00 +00:00 · a6b46ba2c8
commit a6b46ba2c8
parent a3a2067ac5
25 changed files with 886 additions and 219 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,40 @@
+2005-06-21  Dorit Nuzman  <dorit@il.ibm.com>
+
+	* genopinit.c (vec_shl_optab, vec_shr_optab): Initialize new optabs.
+	(reduc_plus_optab): Removed.  Replaced with...
+	(reduc_splus_optab, reduc_uplus_optab): Initialize new optabs.
+	* optabs.c (optab_for_tree_code): Return reduc_splus_optab or
+	reduc_uplus_optab instead of reduc_plus_optab.
+	(expand_vec_shift_expr): New function.
+	(init_optabs): Initialize new optabs. Remove initialization of
+	reduc_plus_optab.
+	(optab_for_tree_code): Return vec_shl_optab/vec_shr_optab
+	for VEC_LSHIFT_EXPR/VEC_RSHIFT_EXPR.
+	* optabs.h (OTI_reduc_plus): Removed. Replaced with...
+	(OTI_reduc_splus, OTI_reduc_uplus): New.
+	(reduc_plus_optab): Removed.  Replaced with...
+	(reduc_splus_optab, reduc_uplus_optab): New optabs.
+	(vec_shl_optab, vec_shr_optab): New optabs.
+	(expand_vec_shift_expr): New function declaration.
+
+	* tree.def (VEC_LSHIFT_EXPR, VEC_RSHIFT_EXPR): New tree-codes.
+	* tree-inline.c (estimate_num_insns_1): Handle new tree-codes.
+	* expr.c (expand_expr_real_1): Handle new tree-codes.
+	* tree-pretty-print.c (dump_generic_node, op_symbol, op_prio): Likewise.
+	* tree-vect-generic.c (expand_vector_operations_1): Add assert.
+
+	* tree-vect-transform.c (vect_create_epilog_for_reduction): Add two
+	alternatives for generating reduction epilog code.
+	(vectorizable_reduction): Don't fail if direct reduction support is
+	not available.
+	(vectorizable_target_reduction_pattern): Likewise.
+
+	* config/rs6000/altivec.md (reduc_smax_v4si, reduc_smax_v4sf,
+	reduc_umax_v4si, reduc_smin_v4si, reduc_smin_v4sf, reduc_umin_v4si,
+	reduc_plus_v4si, reduc_plus_v4sf): Removed.
+	(vec_shl_<mode>, vec_shr_<mode>, altivec_vsumsws_nomode,
+	reduc_splus_<mode>, reduc_uplus_v16qi): New.
+
 2005-06-20  Daniel Berlin  <dberlin@dberlin.org>

 	* c-typeck.c (build_function_call): Set fundecl = function again.
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@ -1825,157 +1825,100 @@
  operands[3] = gen_reg_rtx (GET_MODE (operands[0]));
 })

-;; Reduction
-
-(define_expand "reduc_smax_v4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))]
-  "TARGET_ALTIVEC"
-  "
-{  
-  rtx vtmp1 = gen_reg_rtx (V4SImode);
-  rtx vtmp2 = gen_reg_rtx (V4SImode);
-  rtx vtmp3 = gen_reg_rtx (V4SImode);
-
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_smaxv4si3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_smaxv4si3 (operands[0], vtmp2, vtmp3));
-  DONE;
-}")
-
-(define_expand "reduc_smax_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=v")
-        (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] 217))]
-  "TARGET_ALTIVEC"
-  "
-{ 
-  rtx vtmp1 = gen_reg_rtx (V4SFmode);
-  rtx vtmp2 = gen_reg_rtx (V4SFmode);
-  rtx vtmp3 = gen_reg_rtx (V4SFmode);
-
-  emit_insn (gen_altivec_vsldoi_v4sf (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_smaxv4sf3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4sf (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_smaxv4sf3 (operands[0], vtmp2, vtmp3));
-  DONE;
-}")
-
-(define_expand "reduc_umax_v4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))]
-  "TARGET_ALTIVEC"
-  "
-{ 
-  rtx vtmp1 = gen_reg_rtx (V4SImode);
-  rtx vtmp2 = gen_reg_rtx (V4SImode);
-  rtx vtmp3 = gen_reg_rtx (V4SImode);
-
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_umaxv4si3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_umaxv4si3 (operands[0], vtmp2, vtmp3));
-  DONE;
-}")
-
-(define_expand "reduc_smin_v4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))]
-  "TARGET_ALTIVEC"
-  "
-{ 
-  rtx vtmp1 = gen_reg_rtx (V4SImode);
-  rtx vtmp2 = gen_reg_rtx (V4SImode);
-  rtx vtmp3 = gen_reg_rtx (V4SImode);
-
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_sminv4si3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_sminv4si3 (operands[0], vtmp2, vtmp3));
-  DONE;
-}")
-
-(define_expand "reduc_smin_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=v")
-        (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] 217))]
+;; Vector shift left in bits. Currently supported only for shift
+;; amounts that can be expressed as byte shifts (divisible by 8).
+;; General shift amounts can be supported using vslo + vsl. We're
+;; not expecting to see these yet (the vectorizer currently
+;; generates only shifts divisible by byte_size).
+(define_expand "vec_shl_<mode>"
+  [(set (match_operand:V 0 "register_operand" "=v")
+        (unspec:V [(match_operand:V 1 "register_operand" "v")
+                   (match_operand:QI 2 "reg_or_short_operand" "")] 219 ))]
  "TARGET_ALTIVEC"
  "
 {
-  rtx vtmp1 = gen_reg_rtx (V4SFmode);
-  rtx vtmp2 = gen_reg_rtx (V4SFmode);
-  rtx vtmp3 = gen_reg_rtx (V4SFmode);
+  rtx bitshift = operands[2];
+  rtx byteshift = gen_reg_rtx (QImode);
+  HOST_WIDE_INT bitshift_val;
+  HOST_WIDE_INT byteshift_val;

-  emit_insn (gen_altivec_vsldoi_v4sf (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_sminv4sf3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4sf (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_sminv4sf3 (operands[0], vtmp2, vtmp3));
+  if (! CONSTANT_P (bitshift))
+    FAIL;
+  bitshift_val = INTVAL (bitshift);
+  if (bitshift_val & 0x7)
+    FAIL;
+  byteshift_val = bitshift_val >> 3;
+  byteshift = gen_rtx_CONST_INT (QImode, byteshift_val);
+  emit_insn (gen_altivec_vsldoi_<mode> (operands[0], operands[1], operands[1],
+                                        byteshift));
  DONE;
 }")

-(define_expand "reduc_umin_v4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))]
+;; Vector shift right in bits. Currently supported only for shift
+;; amounts that can be expressed as byte shifts (divisible by 8).
+;; General shift amounts can be supported using vsro + vsr. We're
+;; not expecting to see these yet (the vectorizer currently
+;; generates only shifts divisible by byte_size).
+(define_expand "vec_shr_<mode>"
+  [(set (match_operand:V 0 "register_operand" "=v")
+        (unspec:V [(match_operand:V 1 "register_operand" "v")
+                   (match_operand:QI 2 "reg_or_short_operand" "")] 219 ))]
  "TARGET_ALTIVEC"
  "
 {
-  rtx vtmp1 = gen_reg_rtx (V4SImode);
-  rtx vtmp2 = gen_reg_rtx (V4SImode);
-  rtx vtmp3 = gen_reg_rtx (V4SImode);
-
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_uminv4si3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_uminv4si3 (operands[0], vtmp2, vtmp3));
+  rtx bitshift = operands[2];
+  rtx byteshift = gen_reg_rtx (QImode);
+  HOST_WIDE_INT bitshift_val;
+  HOST_WIDE_INT byteshift_val;
+ 
+  if (! CONSTANT_P (bitshift))
+    FAIL;
+  bitshift_val = INTVAL (bitshift);
+  if (bitshift_val & 0x7)
+    FAIL;
+  byteshift_val = 16 - (bitshift_val >> 3);
+  byteshift = gen_rtx_CONST_INT (QImode, byteshift_val);
+  emit_insn (gen_altivec_vsldoi_<mode> (operands[0], operands[1], operands[1],
+                                        byteshift));
  DONE;
 }")

-(define_expand "reduc_plus_v4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))]
+(define_insn "altivec_vsumsws_nomode"
+  [(set (match_operand 0 "register_operand" "=v")
+        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
+                      (match_operand:V4SI 2 "register_operand" "v")] 135))
+   (set (reg:SI 110) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))]
+  "TARGET_ALTIVEC"
+  "vsumsws %0,%1,%2"
+  [(set_attr "type" "veccomplex")])
+
+(define_expand "reduc_splus_<mode>"
+  [(set (match_operand:VIshort 0 "register_operand" "=v")
+        (unspec:VIshort [(match_operand:VIshort 1 "register_operand" "v")] 217))]
  "TARGET_ALTIVEC"
  "
 { 
+  rtx vzero = gen_reg_rtx (V4SImode);
  rtx vtmp1 = gen_reg_rtx (V4SImode);
-  rtx vtmp2 = gen_reg_rtx (V4SImode);
-  rtx vtmp3 = gen_reg_rtx (V4SImode);

-  emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_addv4si3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_addv4si3 (operands[0], vtmp2, vtmp3));
+  emit_insn (gen_altivec_vspltisw (vzero, const0_rtx));
+  emit_insn (gen_altivec_vsum4s<VI_char>s (vtmp1, operands[1], vzero));
+  emit_insn (gen_altivec_vsumsws_nomode (operands[0], vtmp1, vzero));
  DONE;
 }")
-  
-(define_expand "reduc_plus_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=v")
-        (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] 217))]
+
+(define_expand "reduc_uplus_v16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=v")
+        (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")] 217))]
  "TARGET_ALTIVEC"
  "
-{ 
-  rtx vtmp1 = gen_reg_rtx (V4SFmode);
-  rtx vtmp2 = gen_reg_rtx (V4SFmode);
-  rtx vtmp3 = gen_reg_rtx (V4SFmode);
+{
+  rtx vzero = gen_reg_rtx (V4SImode);
+  rtx vtmp1 = gen_reg_rtx (V4SImode);

-  emit_insn (gen_altivec_vsldoi_v4sf (vtmp1, operands[1], operands[1], 
-				      gen_rtx_CONST_INT (SImode, 8)));
-  emit_insn (gen_addv4sf3 (vtmp2, operands[1], vtmp1));
-  emit_insn (gen_altivec_vsldoi_v4sf (vtmp3, vtmp2, vtmp2, 
-				      gen_rtx_CONST_INT (SImode, 4)));
-  emit_insn (gen_addv4sf3 (operands[0], vtmp2, vtmp3));
+  emit_insn (gen_altivec_vspltisw (vzero, const0_rtx));
+  emit_insn (gen_altivec_vsum4ubs (vtmp1, operands[1], vzero));
+  emit_insn (gen_altivec_vsumsws_nomode (operands[0], vtmp1, vzero));
  DONE;
 }")

--- a/gcc/expr.c
+++ b/gcc/expr.c
@ -8367,6 +8367,13 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode,
        return temp;
      }

+    case VEC_LSHIFT_EXPR:
+    case VEC_RSHIFT_EXPR:
+      {
+	target = expand_vec_shift_expr (exp, target);
+	return target;
+      }
+
    default:
      return lang_hooks.expand_expr (exp, original_target, tmode,
 				     modifier, alt_rtl);
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@ -196,6 +196,8 @@ static const char * const optabs[] =
  "vec_set_optab->handlers[$A].insn_code = CODE_FOR_$(vec_set$a$)",
  "vec_extract_optab->handlers[$A].insn_code = CODE_FOR_$(vec_extract$a$)",
  "vec_init_optab->handlers[$A].insn_code = CODE_FOR_$(vec_init$a$)",
+  "vec_shl_optab->handlers[$A].insn_code = CODE_FOR_$(vec_shl_$a$)",
+  "vec_shr_optab->handlers[$A].insn_code = CODE_FOR_$(vec_shr_$a$)",
  "vec_realign_load_optab->handlers[$A].insn_code = CODE_FOR_$(vec_realign_load_$a$)",
  "vcond_gen_code[$A] = CODE_FOR_$(vcond$a$)",
  "vcondu_gen_code[$A] = CODE_FOR_$(vcondu$a$)",
@ -203,7 +205,8 @@ static const char * const optabs[] =
  "reduc_umax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umax_$a$)",
  "reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)",
  "reduc_umin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umin_$a$)",
-  "reduc_plus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_plus_$a$)" 
+  "reduc_splus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_splus_$a$)" ,
+  "reduc_uplus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_uplus_$a$)" 
 };

 static void gen_insn (rtx);
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@ -301,7 +301,13 @@ optab_for_tree_code (enum tree_code code, tree type)
      return TYPE_UNSIGNED (type) ? reduc_umin_optab : reduc_smin_optab;

    case REDUC_PLUS_EXPR:
-      return reduc_plus_optab;
+      return TYPE_UNSIGNED (type) ? reduc_uplus_optab : reduc_splus_optab;
+
+    case VEC_LSHIFT_EXPR:
+      return vec_shl_optab;
+
+    case VEC_RSHIFT_EXPR:
+      return vec_shr_optab;

    default:
      break;
@ -443,6 +449,61 @@ force_expand_binop (enum machine_mode mode, optab binoptab,
  return true;
 }

+/* Generate insns for VEC_LSHIFT_EXPR, VEC_RSHIFT_EXPR.  */
+
+rtx
+expand_vec_shift_expr (tree vec_shift_expr, rtx target)
+{
+  enum insn_code icode;
+  rtx rtx_op1, rtx_op2;
+  enum machine_mode mode1;
+  enum machine_mode mode2;
+  enum machine_mode mode = TYPE_MODE (TREE_TYPE (vec_shift_expr));
+  tree vec_oprnd = TREE_OPERAND (vec_shift_expr, 0);
+  tree shift_oprnd = TREE_OPERAND (vec_shift_expr, 1);
+  optab shift_optab;
+  rtx pat;
+
+  switch (TREE_CODE (vec_shift_expr))
+    {
+      case VEC_RSHIFT_EXPR:
+	shift_optab = vec_shr_optab;
+	break;
+      case VEC_LSHIFT_EXPR:
+	shift_optab = vec_shl_optab;
+	break;
+      default:
+	gcc_unreachable ();
+    }
+
+  icode = (int) shift_optab->handlers[(int) mode].insn_code;
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  mode1 = insn_data[icode].operand[1].mode;
+  mode2 = insn_data[icode].operand[2].mode;
+
+  rtx_op1 = expand_expr (vec_oprnd, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  if (!(*insn_data[icode].operand[1].predicate) (rtx_op1, mode1)
+      && mode1 != VOIDmode)
+    rtx_op1 = force_reg (mode1, rtx_op1);
+
+  rtx_op2 = expand_expr (shift_oprnd, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  if (!(*insn_data[icode].operand[2].predicate) (rtx_op2, mode2)
+      && mode2 != VOIDmode)
+    rtx_op2 = force_reg (mode2, rtx_op2);
+
+  if (!target
+      || ! (*insn_data[icode].operand[0].predicate) (target, mode))
+    target = gen_reg_rtx (mode);
+
+  /* Emit instruction */
+  pat = GEN_FCN (icode) (target, rtx_op1, rtx_op2);
+  gcc_assert (pat);
+  emit_insn (pat);
+
+  return target;
+}
+
 /* This subroutine of expand_doubleword_shift handles the cases in which
   the effective shift value is >= BITS_PER_WORD.  The arguments and return
   value are the same as for the parent routine, except that SUPERWORD_OP1
@ -5074,11 +5135,14 @@ init_optabs (void)
  reduc_umax_optab = init_optab (UNKNOWN);
  reduc_smin_optab = init_optab (UNKNOWN);
  reduc_umin_optab = init_optab (UNKNOWN);
-  reduc_plus_optab = init_optab (UNKNOWN);
+  reduc_splus_optab = init_optab (UNKNOWN);
+  reduc_uplus_optab = init_optab (UNKNOWN);

  vec_extract_optab = init_optab (UNKNOWN);
  vec_set_optab = init_optab (UNKNOWN);
  vec_init_optab = init_optab (UNKNOWN);
+  vec_shl_optab = init_optab (UNKNOWN);
+  vec_shr_optab = init_optab (UNKNOWN);
  vec_realign_load_optab = init_optab (UNKNOWN);
  movmisalign_optab = init_optab (UNKNOWN);

--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@ -236,7 +236,8 @@ enum optab_index
  OTI_reduc_umax,
  OTI_reduc_smin,
  OTI_reduc_umin,
-  OTI_reduc_plus,
+  OTI_reduc_splus,
+  OTI_reduc_uplus,

  /* Set specified field of vector operand.  */
  OTI_vec_set,
@ -244,6 +245,9 @@ enum optab_index
  OTI_vec_extract,
  /* Initialize vector operand.  */
  OTI_vec_init,
+  /* Whole vector shift. The shift amount is in bits.  */
+  OTI_vec_shl,
+  OTI_vec_shr,
  /* Extract specified elements from vectors, for vector load.  */
  OTI_vec_realign_load,

@ -358,11 +362,14 @@ extern GTY(()) optab optab_table[OTI_MAX];
 #define reduc_umax_optab (optab_table[OTI_reduc_umax])
 #define reduc_smin_optab (optab_table[OTI_reduc_smin])
 #define reduc_umin_optab (optab_table[OTI_reduc_umin])
-#define reduc_plus_optab (optab_table[OTI_reduc_plus])
+#define reduc_splus_optab (optab_table[OTI_reduc_splus])
+#define reduc_uplus_optab (optab_table[OTI_reduc_uplus])

 #define vec_set_optab (optab_table[OTI_vec_set])
 #define vec_extract_optab (optab_table[OTI_vec_extract])
 #define vec_init_optab (optab_table[OTI_vec_init])
+#define vec_shl_optab (optab_table[OTI_vec_shl])
+#define vec_shr_optab (optab_table[OTI_vec_shr])
 #define vec_realign_load_optab (optab_table[OTI_vec_realign_load])

 #define powi_optab (optab_table[OTI_powi])
@ -575,4 +582,7 @@ bool expand_vec_cond_expr_p (tree, enum machine_mode);
 /* Generate code for VEC_COND_EXPR.  */
 extern rtx expand_vec_cond_expr (tree, rtx);

+/* Generate code for VEC_LSHIFT_EXPR and VEC_RSHIFT_EXPR.  */
+extern rtx expand_vec_shift_expr (tree, rtx);
+
 #endif /* GCC_OPTABS_H */
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,21 @@
+2005-06-21  Dorit Nuzman  <dorit@il.ibm.com>
+
+	* lib/target-supports.exp (check_effective_target_vect_reduction): 
+	Remove.
+	* gcc.dg/vect/vect.exp: Run tests with additional flags separately.
+	* gcc.dg/vect/vect-reduc-1.c: Vectorizable on all relevant platforms -
+	remove vect_reduction target keyword. Also avoid two returns in main.
+	* gcc.dg/vect/vect-reduc-3.c: Likewise.
+	* gcc.dg/vect/vect-reduc-2.c: Likewise. Also initialize diff to 0.
+	* gcc.dg/vect/vect-reduc-1short.c: New test.
+	* gcc.dg/vect/vect-reduc-1char.c: New test.
+	* gcc.dg/vect/vect-reduc-2short.c: New test.
+	* gcc.dg/vect/vect-reduc-2char.c: New test.
+	* gcc.dg/vect/vect-reduc-6.c: New test.
+	* gcc.dg/vect/trapv-vect-reduc-4.c: New test.
+	* gcc.dg/vect/fast-math-vect-reduc-5.c: New test.
+	* gcc.dg/vect/fast-math-vect-reduc-7.c: New test.
+
 2005-06-21  Tobias Schl"uter  <tobias.schlueter@physik.uni-muenchen.de>
 	Paul Thomas  <pault@gcc.gnu.org>

--- a/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-5.c
+++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-5.c
@ -0,0 +1,53 @@
+/* { dg-require-effective-target vect_float } */
+
+/* need -funsafe-math-optimizations to vectorize the summation.
+   also need -ffinite-math-only to create the min/max expr.  */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (float x, float max_result)
+{
+  int i;
+  float b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  float c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  float diff = 2;
+  float max = x;
+  float min = 10;
+
+  for (i = 0; i < N; i++) {
+    diff += (b[i] - c[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    max = max < c[i] ? c[i] : max;
+  }
+
+  for (i = 0; i < N; i++) {
+    min = min > c[i] ? c[i] : min;
+  }
+
+  /* check results:  */
+  if (diff != DIFF)
+    abort ();
+  if (max != max_result)
+    abort ();
+  if (min != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-7.c
+++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-7.c
@ -0,0 +1,53 @@
+/* { dg-require-effective-target vect_double } */
+
+/* need -funsafe-math-optimizations to vectorize the summation.
+   also need -ffinite-math-only to create the min/max expr.  */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (double x, double max_result)
+{
+  int i;
+  double b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  double c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  double diff = 2;
+  double max = x;
+  double min = 10;
+
+  for (i = 0; i < N; i++) {
+    diff += (b[i] - c[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    max = max < c[i] ? c[i] : max;
+  }
+
+  for (i = 0; i < N; i++) {
+    min = min > c[i] ? c[i] : min;
+  }
+
+  /* check results:  */
+  if (diff != DIFF)
+    abort ();
+  if (max != max_result)
+    abort ();
+  if (min != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c
+++ b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c
@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-do compile } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (int x, int max_result)
+{
+  int i;
+  int b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  int c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  int diff = 2;
+  int max = x;
+  int min = 10;
+
+  for (i = 0; i < N; i++) {
+    diff += (b[i] - c[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    max = max < c[i] ? c[i] : max;
+  }
+
+  for (i = 0; i < N; i++) {
+    min = min > c[i] ? c[i] : min;
+  }
+
+  /* check results:  */
+  if (diff != DIFF)
+    abort ();
+  if (max != max_result)
+    abort ();
+  if (min != 0)
+    abort ();
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c
@ -47,9 +47,9 @@ int main (void)
 { 
  check_vect ();
  
-  return main1 (100, 100);
-  return main1 (0, 15);
+  main1 (100, 100);
+  main1 (0, 15);
 }

-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail {! vect_reduction} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-1char.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1char.c
@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (unsigned char x, unsigned char max_result)
+{
+  int i;
+  unsigned char ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  unsigned char uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  unsigned char udiff = 2;
+  unsigned char umax = x;
+  unsigned char umin = 10;
+
+  for (i = 0; i < N; i++) {
+    udiff += (unsigned char)(ub[i] - uc[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    umax = umax < uc[i] ? uc[i] : umax;
+  }
+
+  for (i = 0; i < N; i++) {
+    umin = umin > uc[i] ? uc[i] : umin;
+  }
+
+  /* check results:  */
+  if (udiff != DIFF)
+    abort ();
+  if (umax != max_result)
+    abort ();
+  if (umin != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-1short.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1short.c
@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (unsigned short x, unsigned short max_result)
+{
+  int i;
+  unsigned short ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  unsigned short uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  unsigned short  udiff = 2;
+  unsigned short umax = x;
+  unsigned short umin = 10;
+
+  for (i = 0; i < N; i++) {
+    udiff += (unsigned short)(ub[i] - uc[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    umax = umax < uc[i] ? uc[i] : umax;
+  }
+
+  for (i = 0; i < N; i++) {
+    umin = umin > uc[i] ? uc[i] : umin;
+  }
+
+  /* check results:  */
+  if (udiff != DIFF)
+    abort ();
+  if (umax != max_result)
+    abort ();
+  if (umin != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2.c
@ -1,11 +1,10 @@
-
 /* { dg-require-effective-target vect_int } */

 #include <stdarg.h>
 #include "tree-vect.h"

 #define N 16
-#define DIFF 242
+#define DIFF 240

 /* Test vectorization of reduction of signed-int.  */

@ -14,7 +13,7 @@ int main1 (int x, int max_result)
  int i;
  int b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
  int c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-  int diff = 2;
+  int diff = 0;
  int max = x;
  int min = 10;

@ -45,9 +44,10 @@ int main (void)
 { 
  check_vect ();
  
-  return main1 (100, 100);
-  return main1 (0, 15);
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
 }

-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail {! vect_reduction} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c
@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 121
+
+int main1 (char x, char max_result)
+{
+  int i;
+  char b[N] = {0,2,3,6,8,10,12,14,16,18,20,22,24,26,28,30};
+  char c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  signed char diff = 2;
+  char max = x;
+  char min = 10;
+
+  for (i = 0; i < N; i++) {
+    diff += (b[i] - c[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    max = max < c[i] ? c[i] : max;
+  }
+
+  for (i = 0; i < N; i++) {
+    min = min > c[i] ? c[i] : min;
+  }
+
+  /* check results:  */
+  if (diff != DIFF)
+    abort ();
+  if (max != max_result)
+    abort ();
+  if (min != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0 ;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c
@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (short x, short max_result)
+{
+  int i;
+  short b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  short c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  short diff = 2;
+  short max = x;
+  short min = 10;
+
+  for (i = 0; i < N; i++) {
+    diff += (b[i] - c[i]);
+  }
+  for (i = 0; i < N; i++) {
+    max = max < c[i] ? c[i] : max;
+  }
+
+  for (i = 0; i < N; i++) {
+    min = min > c[i] ? c[i] : min;
+  }
+
+  /* check results:  */
+  if (diff != DIFF)
+    abort ();
+  if (max != max_result)
+    abort ();
+  if (min != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100, 100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-3.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-3.c
@ -4,12 +4,11 @@
 #include "tree-vect.h"

 #define N 16
-#define DIFF 240

 /* Test vectorization of reduction of unsigned-int in the presence
   of unknown-loop-bound.  */

-int main1 (int n)
+int main1 (int n, int res)
 {
  int i;
  unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
@ -22,7 +21,7 @@ int main1 (int n)
  }

  /* check results:  */
-  if (udiff != DIFF)
+  if (udiff != res)
    abort ();

  return 0;
@ -32,9 +31,10 @@ int main (void)
 { 
  check_vect ();
  
-  return main1 (N);
-  return main1 (N-1);
+  main1 (N, 240);
+  main1 (N-1, 210);
+  return 0;
 }

-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail {! vect_reduction} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c
@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_float } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+int main1 (float x, float max_result)
+{
+  int i;
+  float b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+  float c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  float diff = 2;
+  float max = x;
+  float min = 10;
+
+  for (i = 0; i < N; i++) {
+    diff += (b[i] - c[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    max = max < c[i] ? c[i] : max;
+  }
+
+  for (i = 0; i < N; i++) {
+    min = min > c[i] ? c[i] : min;
+  }
+
+  /* check results:  */
+  if (diff != DIFF)
+    abort ();
+  if (max != max_result)
+    abort ();
+  if (min != 0)
+    abort ();
+
+  return 0;
+}
+
+int main (void)
+{ 
+  check_vect ();
+  
+  main1 (100 ,100);
+  main1 (0, 15);
+  return 0;
+}
+
+/* Need -ffast-math to vectorize these loops.  */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@ -76,7 +76,25 @@ if [istarget "powerpc*-*-*"] {
 dg-init

 # Main loop.
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]]  \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]]  \
+	"" $DEFAULT_VECTCFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect-*.\[cS\]]]  \
+	"" $DEFAULT_VECTCFLAGS
+
+#### Tests with special options
+global SAVED_DEFAULT_VECTCFLAGS
+set SAVED_DEFAULT_VECTCFLAGS $DEFAULT_VECTCFLAGS
+
+# -ffast-math tests
+set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+lappend DEFAULT_VECTCFLAGS "-ffast-math"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]]  \
+	"" $DEFAULT_VECTCFLAGS
+
+# -ftrapv tests
+set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+lappend DEFAULT_VECTCFLAGS "-ftrapv"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-vect*.\[cS\]]]  \
 	"" $DEFAULT_VECTCFLAGS

 # Clean up.
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@ -988,23 +988,6 @@ proc check_effective_target_vect_int_mult { } {
    return $et_vect_int_mult_saved
 }

-# Return 1 if the target supports vector reduction
-
-proc check_effective_target_vect_reduction { } {
-    global et_vect_reduction_saved
-
-    if [info exists et_vect_reduction_saved] { 
-        verbose "check_effective_target_vect_reduction: using cached result" 2
-    } else {
-        set et_vect_reduction_saved 0
-        if { [istarget powerpc*-*-*] } {
-            set et_vect_reduction_saved 1
-        }
-    }
-    verbose "check_effective_target_vect_reduction: returning $et_vect_reduction_saved" 2
-    return $et_vect_reduction_saved
-}   
-
 # Return 1 if the target supports atomic operations on "int" and "long".

 proc check_effective_target_sync_int_long { } {
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@ -1692,6 +1692,8 @@ estimate_num_insns_1 (tree *tp, int *walk_subtrees, void *data)
    case RSHIFT_EXPR:
    case LROTATE_EXPR:
    case RROTATE_EXPR:
+    case VEC_LSHIFT_EXPR:
+    case VEC_RSHIFT_EXPR:

    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@ -1043,6 +1043,8 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
    case RSHIFT_EXPR:
    case LROTATE_EXPR:
    case RROTATE_EXPR:
+    case VEC_LSHIFT_EXPR:
+    case VEC_RSHIFT_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_AND_EXPR:
@ -1838,6 +1840,8 @@ op_prio (tree op)
    case REDUC_MAX_EXPR:
    case REDUC_MIN_EXPR:
    case REDUC_PLUS_EXPR:
+    case VEC_LSHIFT_EXPR:
+    case VEC_RSHIFT_EXPR:
      return 16;

    case SAVE_EXPR:
@ -1925,6 +1929,12 @@ op_symbol (tree op)
    case RSHIFT_EXPR:
      return ">>";

+    case VEC_LSHIFT_EXPR:
+      return "v<<";
+
+    case VEC_RSHIFT_EXPR:
+      return "v>>";
+ 
    case PLUS_EXPR:
      return "+";

--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@ -448,6 +448,7 @@ expand_vector_operations_1 (block_stmt_iterator *bsi)
 	compute_type = TREE_TYPE (type);
    }

+  gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
  rhs = expand_vector_operation (bsi, type, compute_type, rhs, code);
  if (lang_hooks.types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
    *p_rhs = rhs;
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@ -834,6 +834,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
 {
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  enum machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
@ -843,15 +844,18 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
+  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
-  tree bitsize, bitpos; 
+  tree bitsize, bitpos, bytesize; 
  enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
+  bool extract_scalar_result;
+  bool adjust_in_epilog;
  
  /*** 1. Create the reduction def-use cycle  ***/
  
@ -888,63 +892,214 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
  exit_bsi = bsi_start (exit_bb);


-  /* 2.2 Create:
-        v_out2 = reduc_expr <v_out1>
-        s_out3 = extract_field <v_out2, 0>  */
-
-  vec_dest = vect_create_destination_var (scalar_dest, vectype);
-  epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
-                         build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
-  new_temp = make_ssa_name (vec_dest, epilog_stmt);
-  TREE_OPERAND (epilog_stmt, 0) = new_temp;
-  bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
-
-  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
-    {
-      fprintf (vect_dump, "transform reduction: created epilog code:");
-      print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
-    }
-
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
+  bytesize = TYPE_SIZE_UNIT (scalar_type);

-  /* The result is in the low order bits.  */
-  if (BITS_BIG_ENDIAN)
-    bitpos = size_binop (MULT_EXPR,
-                       bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
-                       TYPE_SIZE (scalar_type));
+  /* 2.2 Create the reduction code.  */
+
+  if (reduc_code < NUM_TREE_CODES)
+    {
+      /*** Case 1:  Create:
+	   v_out2 = reduc_expr <v_out1>  */
+
+      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+	fprintf (vect_dump, "Reduce using direct vector reduction.");
+
+      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+      epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
+			build1 (reduc_code, vectype,  PHI_RESULT (new_phi)));
+      new_temp = make_ssa_name (vec_dest, epilog_stmt);
+      TREE_OPERAND (epilog_stmt, 0) = new_temp;
+      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+
+      extract_scalar_result = true;
+      adjust_in_epilog = true;
+    }
  else
-    bitpos = bitsize_zero_node;
+    {
+      enum tree_code shift_code;
+      bool have_whole_vector_shift = true;
+      enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */
+      int bit_offset;
+      int element_bitsize = tree_low_cst (bitsize, 1);
+      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+      tree vec_temp;

-  epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
-                 build3 (BIT_FIELD_REF, scalar_type,
-                         new_temp, bitsize, bitpos));
-  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
-  TREE_OPERAND (epilog_stmt, 0) = new_temp;
-  bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+      /* The result of the reduction is expected to be in the least
+	 significant bits of the vector.  For big-endian targets this means
+	 the right end of the vector; for little-endian targets this means
+	 the left end of the vector.  */

-  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) 
-    print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+      if (BITS_BIG_ENDIAN
+	  && vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
+	shift_code = VEC_RSHIFT_EXPR;
+      else if (!BITS_BIG_ENDIAN
+	       && vec_shl_optab->handlers[mode].insn_code != CODE_FOR_nothing)
+	shift_code = VEC_LSHIFT_EXPR;
+      else
+	have_whole_vector_shift = false;

+      if (have_whole_vector_shift)
+        {
+	  /*** Case 2:
+	     for (offset = VS/2; offset >= element_size; offset/=2)
+	        {
+	          Create:  va' = vec_shift <va, offset>
+	          Create:  va = vop <va, va'>
+	        }  */
+
+	  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+	    fprintf (vect_dump, "Reduce using vector shifts");
+
+	  vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	  new_temp = PHI_RESULT (new_phi);
+
+	  for (bit_offset = vec_size_in_bits/2;
+	       bit_offset >= element_bitsize;
+	       bit_offset /= 2)
+	    {
+	      tree bitpos = size_int (bit_offset);
+
+	      epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
+	      build2 (shift_code, vectype, new_temp, bitpos));
+	      new_name = make_ssa_name (vec_dest, epilog_stmt);
+	      TREE_OPERAND (epilog_stmt, 0) = new_name;
+	      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+	      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+		print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+
+
+	      epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
+	      build2 (code, vectype, new_name, new_temp));
+	      new_temp = make_ssa_name (vec_dest, epilog_stmt);
+	      TREE_OPERAND (epilog_stmt, 0) = new_temp;
+	      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+	      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+		print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+	    }
+
+	  extract_scalar_result = true;
+	  adjust_in_epilog = true;
+	}
+      else
+        {
+	  /*** Case 3:
+	     Create:  s = init; 
+	     for (offset=0; offset<vector_size; offset+=element_size)
+	       {
+	         Create:  s' = extract_field <v_out2, offset>
+	         Create:  s = op <s, s'>
+	       }  */
+
+	  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+	    fprintf (vect_dump, "Reduce using scalar code. ");
+
+	  vec_temp = PHI_RESULT (new_phi);
+	  vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+
+	  /* first iteration is peeled out when possible to minimize
+	     the number of operations we generate:  */
+	  if (code == PLUS_EXPR 
+	     && (integer_zerop (scalar_initial_def) 
+		 || real_zerop (scalar_initial_def)))
+	    {
+	      epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
+                                build3 (BIT_FIELD_REF, scalar_type,
+                                	vec_temp, bitsize, bitsize_zero_node));
+              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+              TREE_OPERAND (epilog_stmt, 0) = new_temp;
+              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+              if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+                print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+	      
+	      bit_offset = element_bitsize;
+	    }
+	  else
+	    {
+	      new_temp = scalar_initial_def;
+	      bit_offset = 0;
+	    }
+
+	  for (;
+	       bit_offset < vec_size_in_bits;
+	       bit_offset += element_bitsize)
+	    { 
+	      tree bitpos = bitsize_int (bit_offset);
+
+	      epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
+				build3 (BIT_FIELD_REF, scalar_type,
+					vec_temp, bitsize, bitpos));
+	      new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
+	      TREE_OPERAND (epilog_stmt, 0) = new_name;
+	      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+	      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+		print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+
+
+	      epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
+				build2 (code, scalar_type, new_name, new_temp));
+	      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+	      TREE_OPERAND (epilog_stmt, 0) = new_temp;
+	      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+	      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+		print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+	    }
+
+	  extract_scalar_result = false;
+	  adjust_in_epilog = false;
+	}
+    }
+
+
+  /* 2.3  Extract the final scalar result.  Create:
+         s_out3 = extract_field <v_out2, bitpos>  */
  
-  /* 2.3 Adjust the final result by the initial value of the reduction
-         variable. (when such adjustment is not needed, then
-         'scalar_initial_def' is zero).
+  if (extract_scalar_result)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+	fprintf (vect_dump, "extract scalar result");

-         Create:
-         s_out = scalar_expr <s_out, scalar_initial_def>  */
+      /* The result is in the low order bits.  */
+      if (BITS_BIG_ENDIAN)
+	bitpos = size_binop (MULT_EXPR,
+		       bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
+		       TYPE_SIZE (scalar_type));
+      else
+	bitpos = bitsize_zero_node;

-  epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
-                  build2 (code, scalar_type, new_temp, scalar_initial_def));
-  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
-  TREE_OPERAND (epilog_stmt, 0) = new_temp;
-  bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+      epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
+		     build3 (BIT_FIELD_REF, scalar_type,
+			     new_temp, bitsize, bitpos));
+      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+      TREE_OPERAND (epilog_stmt, 0) = new_temp; 
+      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+	print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+    }

-  if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
-    print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);

-    
-  /* 2.4 Replace uses of s_out0 with uses of s_out3  */ 
+  /* 2.4 Adjust the final result by the initial value of the reduction
+	 variable. (when such adjustment is not needed, then
+	 'scalar_initial_def' is zero).
+
+	 Create: 
+	 s_out = scalar_expr <s_out, scalar_initial_def>  */
+  
+  if (adjust_in_epilog)
+    {
+      epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
+                      build2 (code, scalar_type, new_temp, scalar_initial_def));
+      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+      TREE_OPERAND (epilog_stmt, 0) = new_temp;
+      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
+
+      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
+        print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
+    }
+
+
+  /* 2.5 Replace uses of s_out0 with uses of s_out3  */

  /* Find the loop-closed-use at the loop exit of the original
     scalar result.  (The reduction result is expected to have
@ -954,10 +1109,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
-        {
-          exit_phi = USE_STMT (use_p);
-          break;
-        }
+	{
+	  exit_phi = USE_STMT (use_p);
+	  break;
+	}
    }

  orig_name = PHI_RESULT (exit_phi);
@ -1067,13 +1222,13 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
        fprintf (vect_dump, "no optab for reduction.");
-      return false;
+      reduc_code = NUM_TREE_CODES;
    }
  if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC))
-        fprintf (vect_dump, "op not supported by target.");
-      return false;
+        fprintf (vect_dump, "reduc op not supported by target.");
+      reduc_code = NUM_TREE_CODES;
    }
 
  if (!vec_stmt) /* transformation not required.  */
--- a/gcc/tree.def
+++ b/gcc/tree.def
@ -957,6 +957,12 @@ DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1)
 DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1)
 DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1)

+/* Whole vector left/right shift in bits.
+   Operand 0 is a vector to be shifted.
+   Operand 1 is an integer shift amount in bits.  */
+DEFTREECODE (VEC_LSHIFT_EXPR, "vec_lshift_expr", tcc_binary, 2)
+DEFTREECODE (VEC_RSHIFT_EXPR, "vec_rshift_expr", tcc_binary, 2)
+
 /*
 Local variables:
 mode:c