Remove ei_pload1, use posix_memalign to allocate aligned memory,

and make Product work when only one side is vectorizable (in which case
the product is still vectorized).
This commit is contained in:
Gael Guennebaud 2008-05-02 13:30:12 +00:00
parent e19f9bc523
commit 102e029dad
4 changed files with 73 additions and 35 deletions

View File

@ -10,6 +10,11 @@
#endif
#endif
#ifdef EIGEN_VECTORIZE
// it seems we cannot assume posix_memalign is declared by the stdlib header
extern "C" int posix_memalign (void **, size_t, size_t) throw ();
#endif
#include <cstdlib>
#include <cmath>
#include <complex>

View File

@ -54,7 +54,13 @@ T* ei_aligned_malloc(size_t size)
{
#ifdef EIGEN_VECTORIZE
if (ei_packet_traits<T>::size>1)
return static_cast<T*>(_mm_malloc(sizeof(T)*size, 16));
{
void* ptr;
if (posix_memalign(&ptr, 16, size*sizeof(T))==0)
return static_cast<T*>(ptr);
else
return 0;
}
else
#endif
return new T[size];
@ -65,7 +71,7 @@ void ei_aligned_free(T* ptr)
{
#ifdef EIGEN_VECTORIZE
if (ei_packet_traits<T>::size>1)
_mm_free(ptr);
free(ptr);
else
#endif
delete[] ptr;

View File

@ -90,10 +90,6 @@ inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); }
/** Loads one packet of doubles starting at \a from via _mm_load_pd.
  * That intrinsic is the aligned SSE2 load, so \a from is expected to be 16-byte aligned.
  */
inline __m128d ei_pload(const double* from)
{
  return _mm_load_pd(from);
}

/** Loads one 128-bit integer packet starting at \a from via _mm_load_si128
  * (aligned load; the int pointer is reinterpreted as a packet pointer).
  */
inline __m128i ei_pload(const int* from)
{
  const __m128i* const packetAddress = reinterpret_cast<const __m128i*>(from);
  return _mm_load_si128(packetAddress);
}
/** Reads the single float at \a from and replicates it into every lane of the packet. */
inline __m128 ei_pload1(const float* from)
{
  return _mm_load1_ps(from);
}

/** Reads the single double at \a from and replicates it into every lane of the packet. */
inline __m128d ei_pload1(const double* from)
{
  return _mm_load1_pd(from);
}

/** Reads the single int at \a from and replicates it into every 32-bit lane of the packet.
  * There is no integer "load1" intrinsic used here: the value is dereferenced and broadcast.
  */
inline __m128i ei_pload1(const int* from)
{
  return _mm_set1_epi32(*from);
}
/** Builds a float packet whose lanes all hold the value \a from. */
inline __m128 ei_pset1(const float& from)
{
  return _mm_set1_ps(from);
}

/** Builds a double packet whose lanes all hold the value \a from. */
inline __m128d ei_pset1(const double& from)
{
  return _mm_set1_pd(from);
}

/** Builds an integer packet whose 32-bit lanes all hold the value \a from. */
inline __m128i ei_pset1(const int& from)
{
  return _mm_set1_epi32(from);
}

View File

@ -60,29 +60,44 @@ struct ei_product_unroller<Index, 0, Lhs, Rhs>
static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
};
template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_packet_product_unroller
struct ei_packet_product_unroller;
template<int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_packet_product_unroller<true, Index, Size, Lhs, Rhs, PacketScalar>
{
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_packet_product_unroller<RowMajor, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
if (RowMajor)
res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
else
res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
ei_packet_product_unroller<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
}
};
template<bool RowMajor, int Size, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_packet_product_unroller<RowMajor, 0, Size, Lhs, Rhs, PacketScalar>
template<int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_packet_product_unroller<false, Index, Size, Lhs, Rhs, PacketScalar>
{
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
if (RowMajor)
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col));
else
res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
ei_packet_product_unroller<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
}
};
/** \internal
  * Row-major unroller, Index == 0: the case on which the Index-1 recursion of the
  * row-major specialization ends.
  *
  * Writes into \a res the product of the lhs scalar coefficient (row, 0), broadcast
  * with ei_pset1, and the rhs packet at (0, col).
  */
template<int Size, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_packet_product_unroller<true, 0, Size, Lhs, Rhs, PacketScalar>
{
  static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    // Plain assignment, not accumulation: the incoming content of res is overwritten.
    const PacketScalar lhsBroadcast = ei_pset1(lhs.coeff(row, 0));
    const PacketScalar rhsPacket = rhs.packetCoeff(0, col);
    res = ei_pmul(lhsBroadcast, rhsPacket);
  }
};
/** \internal
  * Column-major unroller, Index == 0: the case on which the Index-1 recursion of the
  * column-major specialization ends.
  *
  * Writes into \a res the product of the lhs packet at (row, 0) and the rhs scalar
  * coefficient (0, col), broadcast with ei_pset1.
  */
template<int Size, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_packet_product_unroller<false, 0, Size, Lhs, Rhs, PacketScalar>
{
  static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    // Plain assignment, not accumulation: the incoming content of res is overwritten.
    const PacketScalar lhsPacket = lhs.packetCoeff(row, 0);
    const PacketScalar rhsBroadcast = ei_pset1(rhs.coeff(0, col));
    res = ei_pmul(lhsPacket, rhsBroadcast);
  }
};
@ -92,6 +107,16 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
};
/** \internal
  * Compile-time selector between the two packet-product loops of \a Product.
  *
  * The primary template (RowMajor == true, also the default) forwards to
  * Product::_packetCoeffRowMajor.
  */
template<typename Product, bool RowMajor = true>
struct ProductPacketCoeffImpl
{
  static inline typename Product::PacketScalar execute(const Product& product, int row, int col)
  {
    return product._packetCoeffRowMajor(row, col);
  }
};

/** \internal
  * RowMajor == false: forwards to Product::_packetCoeffColumnMajor.
  */
template<typename Product>
struct ProductPacketCoeffImpl<Product, false>
{
  static inline typename Product::PacketScalar execute(const Product& product, int row, int col)
  {
    return product._packetCoeffColumnMajor(row, col);
  }
};
/** \class Product
*
* \brief Expression of the product of two matrices
@ -158,6 +183,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
public:
EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
friend class ProductPacketCoeffImpl<Product,Flags&RowMajorBit>;
typedef typename ei_traits<Product>::LhsNested LhsNested;
typedef typename ei_traits<Product>::RhsNested RhsNested;
typedef typename ei_traits<Product>::_LhsNested _LhsNested;
@ -202,32 +228,37 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
return res;
}
PacketScalar _packetCoeff(int row, int col) const EIGEN_ALWAYS_INLINE
PacketScalar _packetCoeff(int row, int col) const
{
PacketScalar res;
if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
{
PacketScalar res;
ei_packet_product_unroller<Flags&RowMajorBit, Lhs::ColsAtCompileTime-1,
Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT
? Lhs::ColsAtCompileTime : Dynamic,
_LhsNested, _RhsNested, PacketScalar>
::run(row, col, m_lhs, m_rhs, res);
return res;
}
else
{
if (Flags&RowMajorBit)
{
res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
for(int i = 1; i < m_lhs.cols(); i++)
res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col)));
}
else
{
res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
for(int i = 1; i < m_lhs.cols(); i++)
res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col))));
}
}
return ProductPacketCoeffImpl<Product,Flags&RowMajorBit>::execute(*this, row, col);
}
/** \internal
  * Runtime-loop evaluation of one product packet at (\a row, \a col), row-major flavour.
  *
  * For every inner index k below m_lhs.cols(), the lhs scalar coefficient (row, k) is
  * broadcast with ei_pset1 and multiplied by the rhs packet at (k, col); the terms are
  * summed with ei_padd. Index 0 is read unconditionally to seed the accumulator.
  */
PacketScalar _packetCoeffRowMajor(int row, int col) const
{
  const int depth = m_lhs.cols();
  PacketScalar res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)), m_rhs.packetCoeff(0, col));
  for(int k = 1; k < depth; ++k)
  {
    const PacketScalar term = ei_pmul(ei_pset1(m_lhs.coeff(row, k)), m_rhs.packetCoeff(k, col));
    res = ei_padd(res, term);
  }
  return res;
}
/** \internal
  * Runtime-loop evaluation of one product packet at (\a row, \a col), column-major flavour.
  *
  * For every inner index k below m_lhs.cols(), the lhs packet at (row, k) is multiplied
  * by the rhs scalar coefficient (k, col) broadcast with ei_pset1; the terms are summed
  * with ei_padd. Index 0 is read unconditionally to seed the accumulator.
  */
PacketScalar _packetCoeffColumnMajor(int row, int col) const
{
  const int depth = m_lhs.cols();
  PacketScalar res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
  for(int k = 1; k < depth; ++k)
  {
    const PacketScalar term = ei_pmul(m_lhs.packetCoeff(row, k), ei_pset1(m_rhs.coeff(k, col)));
    res = ei_padd(res, term);
  }
  return res;
}