* patch from Konstantinos Margaritis: bugfix in Altivec version of ei_pdiv

and various cleaning in Altivec code. Altivec vectorization have been re-enabled
  in CoreDeclaration
* added copy constructors in non empty functors because I observed weird behavior with
  std::complex<>
This commit is contained in:
Gael Guennebaud 2008-08-25 16:22:56 +00:00
parent 7ce70e1437
commit 8f9d30cb20
4 changed files with 56 additions and 61 deletions

View File

@ -23,15 +23,12 @@
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
/*** Disable AltiVec code for now as it's out of date
#elif (defined __ALTIVEC__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ALTIVEC
#include <altivec.h>
// We _need_ to #undef bool as it's defined in <altivec.h> for some reason.
#undef bool
***/
#endif
#endif

View File

@ -36,6 +36,8 @@
template<typename Scalar>
struct ei_scalar_add_op {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { }
inline ei_scalar_add_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return a + m_other; }
inline const PacketScalar packetOp(const PacketScalar& a) const
@ -131,6 +133,8 @@ struct ei_functor_traits<ei_scalar_sin_op<Scalar> >
*/
template<typename Scalar>
struct ei_scalar_pow_op {
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_pow_op(const ei_scalar_pow_op& other) : m_exponent(other.m_exponent) { }
inline ei_scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
inline Scalar operator() (const Scalar& a) const { return ei_pow(a, m_exponent); }
const Scalar m_exponent;

View File

@ -143,8 +143,6 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
PacketAccess = ei_packet_traits<Scalar>::size>1
#if (defined EIGEN_VECTORIZE_SSE)
&& NumTraits<Scalar>::HasFloatingPoint
#elif (defined EIGEN_VECTORIZE_ALTIVEC)
&& 0
#endif
};
};
@ -260,6 +258,8 @@ struct ei_functor_traits<ei_scalar_real_op<Scalar> >
template<typename Scalar>
struct ei_scalar_multiple_op {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { }
inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return a * m_other; }
inline const PacketScalar packetOp(const PacketScalar& a) const
@ -273,6 +273,8 @@ struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
template<typename Scalar, bool HasFloatingPoint>
struct ei_scalar_quotient1_impl {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {}
inline Scalar operator() (const Scalar& a) const { return a * m_other; }
inline const PacketScalar packetOp(const PacketScalar& a) const
@ -285,10 +287,11 @@ struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,true> >
template<typename Scalar>
struct ei_scalar_quotient1_impl<Scalar,false> {
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(other) {}
inline Scalar operator() (const Scalar& a) const { return a / m_other; }
const Scalar m_other;
enum { Cost = 2 * NumTraits<Scalar>::MulCost };
};
template<typename Scalar>
struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> >

View File

@ -2,7 +2,6 @@
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@ -35,11 +34,16 @@ typedef vector int v4i;
typedef vector unsigned int v4ui;
typedef vector __bool int v4bi;
static const v4i v0i = vec_splat_s32(0);
static const v4i v1i = vec_splat_s32(1);
static const v4i v16i_ = vec_splat_s32(-16);
static const v4f v0f = (v4f) v0i;
static const v4f v1f = (v4f) v1i;
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
#define USE_CONST_v0i const v4i v0i = vec_splat_s32(0)
#define USE_CONST_v1i const v4i v1i = vec_splat_s32(1)
#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16)
#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i;
#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0);
#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1);
#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_);
template<> struct ei_packet_traits<float> { typedef v4f type; enum {size=4}; };
template<> struct ei_packet_traits<int> { typedef v4i type; enum {size=4}; };
@ -54,7 +58,7 @@ std::ostream & operator <<(std::ostream & s, const v4f & v)
float n[4];
} vt;
vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
@ -65,7 +69,7 @@ std::ostream & operator <<(std::ostream & s, const v4i & v)
int n[4];
} vt;
vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
@ -76,7 +80,7 @@ std::ostream & operator <<(std::ostream & s, const v4ui & v)
unsigned int n[4];
} vt;
vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
@ -87,7 +91,7 @@ std::ostream & operator <<(std::ostream & s, const v4bi & v)
unsigned int n[4];
} vt;
vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
@ -97,71 +101,57 @@ template<> inline v4i ei_padd(const v4i& a, const v4i& b) { return vec_add(
template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); }
template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); }
template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { return vec_madd(a,b, v0f); }
template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
template<> inline v4i ei_pmul(const v4i& a, const v4i& b)
{
// Taken from http://developer.apple.com/hardwaredrivers/ve/algorithms.html#Multiply32
//Set up constants
v4i bswap, low_prod, high_prod, prod, prod_, a_, b_, a1, b1;
v4i v1_, v0_, v1sel;
v1_ = (v4i) vec_splat_u32(-1);
v0_ = (v4i) vec_sl((v4ui) v1_, (v4ui) v1_);
// std::cout << "a: " << a << std::endl;
// std::cout << "b: " << b << std::endl;
a_ = vec_sub(v0i, a);
b_ = vec_sub(v0i, b);
a1 = vec_max(a, a_);
b1 = vec_max(b, b_);
v4bi sgn = (v4bi) vec_cmplt(vec_and(vec_xor(a, b), v0_), v0i);
// Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
//Set up constants, variables
v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
USE_CONST_v0i;
USE_CONST_v1i;
USE_CONST_v16i_;
// std::cout << "a: " << a1 << std::endl;
// std::cout << "b: " << b1 << std::endl;
// std::cout << "a_: " << a_ << std::endl;
// std::cout << "b_: " << b_ << std::endl;
// std::cout << "sgn(a*b): " << std::hex << sgn << std::dec << std::endl;
// Get the absolute values
a1 = vec_abs(a);
b1 = vec_abs(b);
// Do real work
// Get the signs using xor
v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);
// Do the multiplication for the asbolute values.
bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
low_prod = vec_mulo((vector short)a1, (vector short)b1);
high_prod = vec_msum((vector short)a1, (vector short)bswap, v0i);
high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
prod = vec_add( low_prod, high_prod );
// std::cout << "prod: " << prod << std::endl;
// std::cout << "v0i: " << v0i << std::endl;
// NOR the product and select only the negative elements according to the sign mask
prod_ = vec_nor(prod, prod);
prod_ = vec_sel(v0i, prod_, sgn);
// Add 1 to the result to get the negative numbers
v1sel = vec_sel(v0i, v1i, sgn);
// std::cout << "prod_: " << prod_ << std::endl;
// std::cout << "v1sel: " << v1sel << std::endl;
prod_ = vec_add(prod_, v1sel);
// std::cout << "prod_: " << prod_ << std::endl;
// Merge the results back to the final vector.
prod = vec_sel(prod, prod_, sgn);
// std::cout << "final prod: " << prod << std::endl;
return prod;
}
template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) {
std::cout << "a: " << a << std::endl;
std::cout << "b: " << b << std::endl;
v4f t, y_0, y_1, res;
USE_CONST_v0f;
USE_CONST_v1f;
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
v4f y = vec_re(b);
std::cout << "1/b: " << y << std::endl;
y_0 = vec_re(b);
// Do one Newton-Raphson iteration to get the needed accuracy
t = vec_nmsub(y_0, b, v1f);
y_1 = vec_madd(y_0, t, y_0);
// Set up some constants for inverse reciprocals
vector unsigned int v1_;
v4f v0_, t;
v1_ = vec_splat_u32(-1);
std::cout << "-1: " << (v4i) v1_ << std::endl;
v0_ = (v4f) vec_sl(v1_, v1_);
std::cout << "-0.0: " << v0_ << std::endl;
// Do a Newton-Raphson iteration to get the needed accuracy
t = vec_nmsub(y, b, v1f);
std::cout << "t: " << t << std::endl;
y = vec_nmsub(t, y, v0_);
v4f res = vec_madd(a, y, v0_);
std::cout << "res: " << res << std::endl;
res = vec_madd(a, y_1, v0f);
return res;
}
@ -184,7 +174,7 @@ template<> inline v4f ei_ploadu(const float* from)
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
mask = vec_lvsl(0, from); // create the permute mask
return (v4f) vec_perm(MSQ, LSQ, mask); // align the data
return (v4f) vec_perm(MSQ, LSQ, mask); // align the data
}
template<> inline v4i ei_ploadu(const int* from)
@ -336,6 +326,7 @@ inline v4i ei_preduxp(const v4i* vecs)
inline int ei_predux(const v4i& a)
{
USE_CONST_v0i;
v4i sum;
sum = vec_sums(a, v0i);
sum = vec_sld(sum, v0i, 12);