mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-01-24 14:45:14 +08:00
* patch from Konstantinos Margaritis: bugfix in Altivec version of ei_pdiv
and various cleaning in Altivec code. Altivec vectorization have been re-enabled in CoreDeclaration * added copy constructors in non empty functors because I observed weird behavior with std::complex<>
This commit is contained in:
parent
7ce70e1437
commit
8f9d30cb20
@ -23,15 +23,12 @@
|
||||
#ifdef __SSSE3__
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
/*** Disable AltiVec code for now as it's out of date
|
||||
#elif (defined __ALTIVEC__)
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ALTIVEC
|
||||
#include <altivec.h>
|
||||
// We _need_ to #undef bool as it's defined in <altivec.h> for some reason.
|
||||
#undef bool
|
||||
***/
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -36,6 +36,8 @@
|
||||
template<typename Scalar>
|
||||
struct ei_scalar_add_op {
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
|
||||
// FIXME default copy constructors seems bugged with std::complex<>
|
||||
inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { }
|
||||
inline ei_scalar_add_op(const Scalar& other) : m_other(other) { }
|
||||
inline Scalar operator() (const Scalar& a) const { return a + m_other; }
|
||||
inline const PacketScalar packetOp(const PacketScalar& a) const
|
||||
@ -131,6 +133,8 @@ struct ei_functor_traits<ei_scalar_sin_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar>
|
||||
struct ei_scalar_pow_op {
|
||||
// FIXME default copy constructors seems bugged with std::complex<>
|
||||
inline ei_scalar_pow_op(const ei_scalar_pow_op& other) : m_exponent(other.m_exponent) { }
|
||||
inline ei_scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
|
||||
inline Scalar operator() (const Scalar& a) const { return ei_pow(a, m_exponent); }
|
||||
const Scalar m_exponent;
|
||||
|
@ -143,8 +143,6 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
|
||||
PacketAccess = ei_packet_traits<Scalar>::size>1
|
||||
#if (defined EIGEN_VECTORIZE_SSE)
|
||||
&& NumTraits<Scalar>::HasFloatingPoint
|
||||
#elif (defined EIGEN_VECTORIZE_ALTIVEC)
|
||||
&& 0
|
||||
#endif
|
||||
};
|
||||
};
|
||||
@ -260,6 +258,8 @@ struct ei_functor_traits<ei_scalar_real_op<Scalar> >
|
||||
template<typename Scalar>
|
||||
struct ei_scalar_multiple_op {
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
|
||||
// FIXME default copy constructors seems bugged with std::complex<>
|
||||
inline ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { }
|
||||
inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
|
||||
inline Scalar operator() (const Scalar& a) const { return a * m_other; }
|
||||
inline const PacketScalar packetOp(const PacketScalar& a) const
|
||||
@ -273,6 +273,8 @@ struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
|
||||
template<typename Scalar, bool HasFloatingPoint>
|
||||
struct ei_scalar_quotient1_impl {
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
|
||||
// FIXME default copy constructors seems bugged with std::complex<>
|
||||
inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
|
||||
inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {}
|
||||
inline Scalar operator() (const Scalar& a) const { return a * m_other; }
|
||||
inline const PacketScalar packetOp(const PacketScalar& a) const
|
||||
@ -285,10 +287,11 @@ struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,true> >
|
||||
|
||||
template<typename Scalar>
|
||||
struct ei_scalar_quotient1_impl<Scalar,false> {
|
||||
// FIXME default copy constructors seems bugged with std::complex<>
|
||||
inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
|
||||
inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(other) {}
|
||||
inline Scalar operator() (const Scalar& a) const { return a / m_other; }
|
||||
const Scalar m_other;
|
||||
enum { Cost = 2 * NumTraits<Scalar>::MulCost };
|
||||
};
|
||||
template<typename Scalar>
|
||||
struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> >
|
||||
|
@ -2,7 +2,6 @@
|
||||
// for linear algebra. Eigen itself is part of the KDE project.
|
||||
//
|
||||
// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
|
||||
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
|
||||
//
|
||||
// Eigen is free software; you can redistribute it and/or
|
||||
// modify it under the terms of the GNU Lesser General Public
|
||||
@ -35,11 +34,16 @@ typedef vector int v4i;
|
||||
typedef vector unsigned int v4ui;
|
||||
typedef vector __bool int v4bi;
|
||||
|
||||
static const v4i v0i = vec_splat_s32(0);
|
||||
static const v4i v1i = vec_splat_s32(1);
|
||||
static const v4i v16i_ = vec_splat_s32(-16);
|
||||
static const v4f v0f = (v4f) v0i;
|
||||
static const v4f v1f = (v4f) v1i;
|
||||
// We don't want to write the same code all the time, but we need to reuse the constants
|
||||
// and it doesn't really work to declare them global, so we define macros instead
|
||||
|
||||
#define USE_CONST_v0i const v4i v0i = vec_splat_s32(0)
|
||||
#define USE_CONST_v1i const v4i v1i = vec_splat_s32(1)
|
||||
#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16)
|
||||
#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i;
|
||||
#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0);
|
||||
#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1);
|
||||
#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_);
|
||||
|
||||
template<> struct ei_packet_traits<float> { typedef v4f type; enum {size=4}; };
|
||||
template<> struct ei_packet_traits<int> { typedef v4i type; enum {size=4}; };
|
||||
@ -54,7 +58,7 @@ std::ostream & operator <<(std::ostream & s, const v4f & v)
|
||||
float n[4];
|
||||
} vt;
|
||||
vt.v = v;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -65,7 +69,7 @@ std::ostream & operator <<(std::ostream & s, const v4i & v)
|
||||
int n[4];
|
||||
} vt;
|
||||
vt.v = v;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -76,7 +80,7 @@ std::ostream & operator <<(std::ostream & s, const v4ui & v)
|
||||
unsigned int n[4];
|
||||
} vt;
|
||||
vt.v = v;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -87,7 +91,7 @@ std::ostream & operator <<(std::ostream & s, const v4bi & v)
|
||||
unsigned int n[4];
|
||||
} vt;
|
||||
vt.v = v;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl;
|
||||
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -97,71 +101,57 @@ template<> inline v4i ei_padd(const v4i& a, const v4i& b) { return vec_add(
|
||||
template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); }
|
||||
template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); }
|
||||
|
||||
template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { return vec_madd(a,b, v0f); }
|
||||
template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
|
||||
template<> inline v4i ei_pmul(const v4i& a, const v4i& b)
|
||||
{
|
||||
// Taken from http://developer.apple.com/hardwaredrivers/ve/algorithms.html#Multiply32
|
||||
//Set up constants
|
||||
v4i bswap, low_prod, high_prod, prod, prod_, a_, b_, a1, b1;
|
||||
v4i v1_, v0_, v1sel;
|
||||
v1_ = (v4i) vec_splat_u32(-1);
|
||||
v0_ = (v4i) vec_sl((v4ui) v1_, (v4ui) v1_);
|
||||
|
||||
// std::cout << "a: " << a << std::endl;
|
||||
// std::cout << "b: " << b << std::endl;
|
||||
a_ = vec_sub(v0i, a);
|
||||
b_ = vec_sub(v0i, b);
|
||||
a1 = vec_max(a, a_);
|
||||
b1 = vec_max(b, b_);
|
||||
v4bi sgn = (v4bi) vec_cmplt(vec_and(vec_xor(a, b), v0_), v0i);
|
||||
// Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
|
||||
//Set up constants, variables
|
||||
v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
|
||||
USE_CONST_v0i;
|
||||
USE_CONST_v1i;
|
||||
USE_CONST_v16i_;
|
||||
|
||||
// std::cout << "a: " << a1 << std::endl;
|
||||
// std::cout << "b: " << b1 << std::endl;
|
||||
// std::cout << "a_: " << a_ << std::endl;
|
||||
// std::cout << "b_: " << b_ << std::endl;
|
||||
// std::cout << "sgn(a*b): " << std::hex << sgn << std::dec << std::endl;
|
||||
// Get the absolute values
|
||||
a1 = vec_abs(a);
|
||||
b1 = vec_abs(b);
|
||||
|
||||
// Do real work
|
||||
// Get the signs using xor
|
||||
v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);
|
||||
|
||||
// Do the multiplication for the asbolute values.
|
||||
bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
|
||||
low_prod = vec_mulo((vector short)a1, (vector short)b1);
|
||||
high_prod = vec_msum((vector short)a1, (vector short)bswap, v0i);
|
||||
high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
|
||||
prod = vec_add( low_prod, high_prod );
|
||||
// std::cout << "prod: " << prod << std::endl;
|
||||
// std::cout << "v0i: " << v0i << std::endl;
|
||||
|
||||
// NOR the product and select only the negative elements according to the sign mask
|
||||
prod_ = vec_nor(prod, prod);
|
||||
prod_ = vec_sel(v0i, prod_, sgn);
|
||||
|
||||
// Add 1 to the result to get the negative numbers
|
||||
v1sel = vec_sel(v0i, v1i, sgn);
|
||||
// std::cout << "prod_: " << prod_ << std::endl;
|
||||
// std::cout << "v1sel: " << v1sel << std::endl;
|
||||
prod_ = vec_add(prod_, v1sel);
|
||||
// std::cout << "prod_: " << prod_ << std::endl;
|
||||
|
||||
// Merge the results back to the final vector.
|
||||
prod = vec_sel(prod, prod_, sgn);
|
||||
// std::cout << "final prod: " << prod << std::endl;
|
||||
|
||||
return prod;
|
||||
}
|
||||
|
||||
template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) {
|
||||
std::cout << "a: " << a << std::endl;
|
||||
std::cout << "b: " << b << std::endl;
|
||||
v4f t, y_0, y_1, res;
|
||||
USE_CONST_v0f;
|
||||
USE_CONST_v1f;
|
||||
|
||||
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
|
||||
v4f y = vec_re(b);
|
||||
std::cout << "1/b: " << y << std::endl;
|
||||
y_0 = vec_re(b);
|
||||
|
||||
// Do one Newton-Raphson iteration to get the needed accuracy
|
||||
t = vec_nmsub(y_0, b, v1f);
|
||||
y_1 = vec_madd(y_0, t, y_0);
|
||||
|
||||
// Set up some constants for inverse reciprocals
|
||||
vector unsigned int v1_;
|
||||
v4f v0_, t;
|
||||
v1_ = vec_splat_u32(-1);
|
||||
std::cout << "-1: " << (v4i) v1_ << std::endl;
|
||||
v0_ = (v4f) vec_sl(v1_, v1_);
|
||||
std::cout << "-0.0: " << v0_ << std::endl;
|
||||
// Do a Newton-Raphson iteration to get the needed accuracy
|
||||
t = vec_nmsub(y, b, v1f);
|
||||
std::cout << "t: " << t << std::endl;
|
||||
y = vec_nmsub(t, y, v0_);
|
||||
v4f res = vec_madd(a, y, v0_);
|
||||
std::cout << "res: " << res << std::endl;
|
||||
res = vec_madd(a, y_1, v0f);
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -184,7 +174,7 @@ template<> inline v4f ei_ploadu(const float* from)
|
||||
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
||||
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
||||
mask = vec_lvsl(0, from); // create the permute mask
|
||||
return (v4f) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
return (v4f) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
}
|
||||
|
||||
template<> inline v4i ei_ploadu(const int* from)
|
||||
@ -336,6 +326,7 @@ inline v4i ei_preduxp(const v4i* vecs)
|
||||
|
||||
inline int ei_predux(const v4i& a)
|
||||
{
|
||||
USE_CONST_v0i;
|
||||
v4i sum;
|
||||
sum = vec_sums(a, v0i);
|
||||
sum = vec_sld(sum, v0i, 12);
|
||||
|
Loading…
Reference in New Issue
Block a user