* patch from Konstantinos Margaritis: bugfix in Altivec version of ei_pdiv

and various cleaning in Altivec code. Altivec vectorization have been re-enabled in CoreDeclaration * added copy constructors in non empty functors because I observed weird behavior with std::complex<>
2025-01-24 14:45:14 +08:00 · 2008-08-25 16:22:56 +00:00 · 2008-08-25 16:22:56 +00:00 · 8f9d30cb20
commit 8f9d30cb20
parent 7ce70e1437
4 changed files with 56 additions and 61 deletions
--- a/Eigen/CoreDeclarations
+++ b/Eigen/CoreDeclarations
@ -23,15 +23,12 @@
    #ifdef __SSSE3__
      #include <tmmintrin.h>
    #endif
-
-/*** Disable AltiVec code for now as it's out of date
  #elif (defined __ALTIVEC__)
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ALTIVEC
    #include <altivec.h>
    // We _need_ to #undef bool as it's defined in <altivec.h> for some reason.
    #undef bool
-***/
  #endif
 #endif

--- a/Eigen/src/Array/Functors.h
+++ b/Eigen/src/Array/Functors.h
@ -36,6 +36,8 @@
 template<typename Scalar>
 struct ei_scalar_add_op {
  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { }
  inline ei_scalar_add_op(const Scalar& other) : m_other(other) { }
  inline Scalar operator() (const Scalar& a) const { return a + m_other; }
  inline const PacketScalar packetOp(const PacketScalar& a) const
@ -131,6 +133,8 @@ struct ei_functor_traits<ei_scalar_sin_op<Scalar> >
  */
 template<typename Scalar>
 struct ei_scalar_pow_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline ei_scalar_pow_op(const ei_scalar_pow_op& other) : m_exponent(other.m_exponent) { }
  inline ei_scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
  inline Scalar operator() (const Scalar& a) const { return ei_pow(a, m_exponent); }
  const Scalar m_exponent;
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@ -143,8 +143,6 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
    PacketAccess = ei_packet_traits<Scalar>::size>1
                  #if (defined EIGEN_VECTORIZE_SSE)
                  && NumTraits<Scalar>::HasFloatingPoint
-                  #elif (defined EIGEN_VECTORIZE_ALTIVEC)
-                  && 0
                  #endif
  };
 };
@ -260,6 +258,8 @@ struct ei_functor_traits<ei_scalar_real_op<Scalar> >
 template<typename Scalar>
 struct ei_scalar_multiple_op {
  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { }
  inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
  inline Scalar operator() (const Scalar& a) const { return a * m_other; }
  inline const PacketScalar packetOp(const PacketScalar& a) const
@ -273,6 +273,8 @@ struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
 template<typename Scalar, bool HasFloatingPoint>
 struct ei_scalar_quotient1_impl {
  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
  inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {}
  inline Scalar operator() (const Scalar& a) const { return a * m_other; }
  inline const PacketScalar packetOp(const PacketScalar& a) const
@ -285,10 +287,11 @@ struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,true> >

 template<typename Scalar>
 struct ei_scalar_quotient1_impl<Scalar,false> {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
  inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(other) {}
  inline Scalar operator() (const Scalar& a) const { return a / m_other; }
  const Scalar m_other;
-  enum { Cost = 2 * NumTraits<Scalar>::MulCost };
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> >
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -2,7 +2,6 @@
 // for linear algebra. Eigen itself is part of the KDE project.
 //
 // Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@ -35,11 +34,16 @@ typedef vector int            v4i;
 typedef vector unsigned int   v4ui;
 typedef vector __bool int     v4bi;

-static const v4i   v0i   = vec_splat_s32(0);
-static const v4i   v1i   = vec_splat_s32(1);
-static const v4i   v16i_ = vec_splat_s32(-16);
-static const v4f   v0f   = (v4f) v0i;
-static const v4f   v1f   = (v4f) v1i;
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+
+#define USE_CONST_v0i     const v4i   v0i   = vec_splat_s32(0)
+#define USE_CONST_v1i     const v4i   v1i   = vec_splat_s32(1)
+#define USE_CONST_v16i_   const v4i   v16i_ = vec_splat_s32(-16)
+#define USE_CONST_v0f     USE_CONST_v0i; const v4f v0f = (v4f) v0i;
+#define USE_CONST_v1f     USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0);
+#define USE_CONST_v1i_    const v4ui  v1i_  = vec_splat_u32(-1);
+#define USE_CONST_v0f_    USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_);

 template<> struct ei_packet_traits<float>  { typedef v4f type; enum {size=4}; };
 template<> struct ei_packet_traits<int>    { typedef v4i type; enum {size=4}; };
@ -54,7 +58,7 @@ std::ostream & operator <<(std::ostream & s, const v4f & v)
    float n[4];
  } vt;
  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }

@ -65,7 +69,7 @@ std::ostream & operator <<(std::ostream & s, const v4i & v)
    int n[4];
  } vt;
  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }

@ -76,7 +80,7 @@ std::ostream & operator <<(std::ostream & s, const v4ui & v)
    unsigned int n[4];
  } vt;
  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }

@ -87,7 +91,7 @@ std::ostream & operator <<(std::ostream & s, const v4bi & v)
    unsigned int n[4];
  } vt;
  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }

@ -97,71 +101,57 @@ template<> inline v4i  ei_padd(const v4i&   a, const v4i&   b) { return vec_add(
 template<> inline v4f  ei_psub(const v4f&   a, const v4f&   b) { return vec_sub(a,b); }
 template<> inline v4i  ei_psub(const v4i&   a, const v4i&   b) { return vec_sub(a,b); }

-template<> inline v4f  ei_pmul(const v4f&   a, const v4f&   b) { return vec_madd(a,b, v0f); }
+template<> inline v4f  ei_pmul(const v4f&   a, const v4f&   b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
 template<> inline v4i  ei_pmul(const v4i&   a, const v4i&   b)
 {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/algorithms.html#Multiply32
-  //Set up constants
-  v4i bswap, low_prod, high_prod, prod, prod_, a_, b_, a1, b1;
-  v4i v1_, v0_, v1sel;
-  v1_ = (v4i) vec_splat_u32(-1);
-  v0_ = (v4i) vec_sl((v4ui) v1_, (v4ui) v1_);
- 
-//  std::cout << "a: " << a << std::endl;
-//  std::cout << "b: " << b << std::endl;
-  a_ = vec_sub(v0i, a);
-  b_ = vec_sub(v0i, b);
-  a1  = vec_max(a, a_);
-  b1  = vec_max(b, b_);
-  v4bi sgn = (v4bi) vec_cmplt(vec_and(vec_xor(a, b), v0_), v0i);
+  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
+  //Set up constants, variables
+  v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+  USE_CONST_v0i;
+  USE_CONST_v1i;
+  USE_CONST_v16i_;

-//  std::cout << "a: " << a1 << std::endl;
-//  std::cout << "b: " << b1 << std::endl;
-//  std::cout << "a_: " << a_ << std::endl;
-//  std::cout << "b_: " << b_ << std::endl;
-//  std::cout << "sgn(a*b): " << std::hex << sgn << std::dec << std::endl;
+  // Get the absolute values 
+  a1  = vec_abs(a);
+  b1  = vec_abs(b);

-  // Do real work
+  // Get the signs using xor
+  v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);
+
+  // Do the multiplication for the asbolute values.
  bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
  low_prod = vec_mulo((vector short)a1, (vector short)b1);
  high_prod = vec_msum((vector short)a1, (vector short)bswap, v0i);
  high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
  prod = vec_add( low_prod, high_prod );
-//  std::cout << "prod: " << prod << std::endl;
-//  std::cout << "v0i: " << v0i << std::endl;
+
+  // NOR the product and select only the negative elements according to the sign mask
  prod_ = vec_nor(prod, prod);
  prod_ = vec_sel(v0i, prod_, sgn);
+
+  // Add 1 to the result to get the negative numbers
  v1sel = vec_sel(v0i, v1i, sgn);
-//  std::cout << "prod_: " << prod_ << std::endl;
-//  std::cout << "v1sel: " << v1sel << std::endl;
  prod_ = vec_add(prod_, v1sel);
-//  std::cout << "prod_: " << prod_ << std::endl;
+
+  // Merge the results back to the final vector.
  prod = vec_sel(prod, prod_, sgn);
-//  std::cout << "final prod: " << prod << std::endl;
+
  return prod;
 }

 template<> inline v4f  ei_pdiv(const v4f&   a, const v4f&   b) {
-  std::cout << "a: " << a << std::endl;
-  std::cout << "b: " << b << std::endl;
+  v4f t, y_0, y_1, res;
+  USE_CONST_v0f;
+  USE_CONST_v1f;

  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
-  v4f y = vec_re(b);
-  std::cout << "1/b: " << y << std::endl;
+  y_0 = vec_re(b);
+  
+  // Do one Newton-Raphson iteration to get the needed accuracy
+  t = vec_nmsub(y_0, b, v1f);
+  y_1 = vec_madd(y_0, t, y_0);

-  // Set up some constants for inverse reciprocals
-  vector unsigned int v1_;
-  v4f v0_, t;
-  v1_ = vec_splat_u32(-1);
-  std::cout << "-1: " << (v4i) v1_ << std::endl;
-  v0_ = (v4f) vec_sl(v1_, v1_);
-  std::cout << "-0.0: " << v0_ << std::endl;
-  // Do a Newton-Raphson iteration to get the needed accuracy
-  t = vec_nmsub(y, b, v1f);
-  std::cout << "t: " << t << std::endl;
-  y = vec_nmsub(t, y, v0_);
-  v4f res = vec_madd(a, y, v0_);
-  std::cout << "res: " << res << std::endl;
+  res = vec_madd(a, y_1, v0f);
  return res;
 }

@ -184,7 +174,7 @@ template<> inline v4f  ei_ploadu(const float*  from)
  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (v4f) vec_perm(MSQ, LSQ, mask);  // align the data
+  return (v4f) vec_perm(MSQ, LSQ, mask);           // align the data
 }

 template<> inline v4i    ei_ploadu(const int*    from)
@ -336,6 +326,7 @@ inline v4i  ei_preduxp(const v4i* vecs)

 inline int ei_predux(const v4i& a)
 {
+  USE_CONST_v0i;
  v4i sum;
  sum = vec_sums(a, v0i);
  sum = vec_sld(sum, v0i, 12);