add the vectorization of abs

2025-01-18 14:34:17 +08:00 · 2009-03-09 18:40:09 +00:00 · 2009-03-09 18:40:09 +00:00 · 3f80c68be5
commit 3f80c68be5
parent bd8107c90c
5 changed files with 52 additions and 12 deletions
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -203,13 +203,16 @@ struct ei_functor_traits<ei_scalar_opposite_op<Scalar> >
 template<typename Scalar> struct ei_scalar_abs_op EIGEN_EMPTY_STRUCT {
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs(a); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pabs(a); } // packet counterpart of operator(): forwards the whole packet to ei_pabs
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_abs_op<Scalar> >
 {
  enum {
    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = false // FIXME this could actually be vectorized with SSSE3.
+    PacketAccess = int(ei_packet_traits<Scalar>::size)>1 // packet path is enabled only when ei_packet_traits reports more than one element per packet
  };
 };

--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -64,6 +64,10 @@ template<typename Packet> inline Packet
 ei_pmax(const Packet& a,
        const Packet& b) { return std::max(a, b); }

+/** \internal \returns the absolute value of \a a (generic version: forwards to ei_abs) */
+template<typename Packet> inline Packet
+ei_pabs(const Packet& a) { return ei_abs(a); }
+
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
 ei_pload(const Scalar* from) { return *from; }
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -163,6 +163,9 @@ template<> inline v4i  ei_pmin(const v4i&   a, const v4i&   b) { return vec_min(
 template<> inline v4f  ei_pmax(const v4f&   a, const v4f&   b) { return vec_max(a,b); }
 template<> inline v4i  ei_pmax(const v4i&   a, const v4i&   b) { return vec_max(a,b); }

+template<> EIGEN_STRONG_INLINE v4f ei_pabs(const v4f& a) { return vec_abs(a); } // v4f specialization: delegates to vec_abs
+template<> EIGEN_STRONG_INLINE v4i ei_pabs(const v4i& a) { return vec_abs(a); } // v4i specialization: delegates to vec_abs
+
 template<> inline v4f  ei_pload(const float* from) { return vec_ld(0, from); }
 template<> inline v4i  ei_pload(const int*   from) { return vec_ld(0, from); }

--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -34,7 +34,7 @@

 #define ei_vec4i_swizzle1(v,p,q,r,s) \
  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
-  
+
 #define ei_vec4f_swizzle2(a,b,p,q,r,s) \
  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

@@ -146,6 +146,28 @@ template<> EIGEN_STRONG_INLINE __m128d ei_preverse(const __m128d& a)
 template<> EIGEN_STRONG_INLINE __m128i ei_preverse(const __m128i& a)
 { return _mm_shuffle_epi32(a,0x1B); }

+
+template<> EIGEN_STRONG_INLINE __m128 ei_pabs(const __m128& a) // absolute value for the __m128 packet
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); // every 32-bit lane has all bits set except the top (sign) bit
+  return _mm_and_ps(a,mask); // AND with the mask clears the sign bit of each lane and keeps the other bits
+}
+template<> EIGEN_STRONG_INLINE __m128d ei_pabs(const __m128d& a) // absolute value for the __m128d packet
+{
+  const __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); // 32-bit words listed low to high: each 64-bit lane keeps its low word and clears only the top bit of its high word
+  return _mm_and_pd(a,mask); // AND with the mask clears the sign bit of each 64-bit lane
+}
+template<> EIGEN_STRONG_INLINE __m128i ei_pabs(const __m128i& a) // absolute value for the __m128i packet, treated as 32-bit lanes
+{
+  #ifdef __SSSE3__
+  return _mm_abs_epi32(a); // direct 32-bit integer abs when __SSSE3__ is defined
+  #else
+  __m128i aux = _mm_srai_epi32(a,31); // arithmetic shift by 31: all ones in negative lanes, zero in the others
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); // (a ^ aux) - aux negates the negative lanes and leaves the others unchanged
+  #endif
+}
+
+
 #ifdef __SSE3__
 // TODO implement SSE2 versions as well as integer versions
 template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -33,13 +33,20 @@ template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int s
  return true;
 }

-#define CHECK_CWISE(REFOP, POP) { \
+#define CHECK_CWISE2(REFOP, POP) { /* binary check: REFOP per element pair vs POP on two loaded packets */ \
  for (int i=0; i<PacketSize; ++i) \
    ref[i] = REFOP(data1[i], data1[i+PacketSize]); \
  ei_pstore(data2, POP(ei_pload(data1), ei_pload(data1+PacketSize))); \
  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
 }

+#define CHECK_CWISE1(REFOP, POP) { /* unary check: REFOP per element vs POP on one loaded packet */ \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = REFOP(data1[i]); /* scalar reference result for each of the PacketSize elements */ \
+  ei_pstore(data2, POP(ei_pload(data1))); /* packet result is stored to data2 for comparison */ \
+  VERIFY(areApprox(ref, data2, PacketSize) && #POP); /* #POP stringifies the op name into the checked expression, presumably so a failure names it */ \
+}
+
 #define REF_ADD(a,b) ((a)+(b))
 #define REF_SUB(a,b) ((a)-(b))
 #define REF_MUL(a,b) ((a)*(b))
@@ -103,15 +110,16 @@ template<typename Scalar> void packetmath()
    VERIFY(areApprox(ref, data2, PacketSize) && "ei_palign");
  }

-  CHECK_CWISE(REF_ADD,  ei_padd);
-  CHECK_CWISE(REF_SUB,  ei_psub);
-  CHECK_CWISE(REF_MUL,  ei_pmul);
+  CHECK_CWISE2(REF_ADD,  ei_padd);
+  CHECK_CWISE2(REF_SUB,  ei_psub);
+  CHECK_CWISE2(REF_MUL,  ei_pmul);
  #ifndef EIGEN_VECTORIZE_ALTIVEC
  if (!ei_is_same_type<Scalar,int>::ret)
-    CHECK_CWISE(REF_DIV,  ei_pdiv);
+    CHECK_CWISE2(REF_DIV,  ei_pdiv);
  #endif
-  CHECK_CWISE(std::min, ei_pmin);
-  CHECK_CWISE(std::max, ei_pmax);
+  CHECK_CWISE2(std::min, ei_pmin);
+  CHECK_CWISE2(std::max, ei_pmax);
+  CHECK_CWISE1(ei_abs, ei_pabs);

  for (int i=0; i<PacketSize; ++i)
    ref[i] = data1[0];
@@ -124,17 +132,17 @@ template<typename Scalar> void packetmath()
  for (int i=0; i<PacketSize; ++i)
    ref[0] += data1[i];
  VERIFY(ei_isApprox(ref[0], ei_predux(ei_pload(data1))) && "ei_predux");
-  
+
  ref[0] = 1;
  for (int i=0; i<PacketSize; ++i)
    ref[0] *= data1[i];
  VERIFY(ei_isApprox(ref[0], ei_predux_mul(ei_pload(data1))) && "ei_predux_mul");
-  
+
  ref[0] = data1[0];
  for (int i=0; i<PacketSize; ++i)
    ref[0] = std::min(ref[0],data1[i]);
  VERIFY(ei_isApprox(ref[0], ei_predux_min(ei_pload(data1))) && "ei_predux_min");
-  
+
  ref[0] = data1[0];
  for (int i=0; i<PacketSize; ++i)
    ref[0] = std::max(ref[0],data1[i]);