* implement slice vectorization. Because it uses unaligned packet access,
  it is not certain that it will bring a performance improvement:
  benchmarking is needed.
* improve logic choosing slice vectorization.
* fix typo in SSE packet math, causing crash in unaligned case.
* fix bug in Product, causing crash in unaligned case.
* add TEST_SSE3 CMake option.
This commit is contained in:
Benoit Jacob 2008-06-22 15:02:05 +00:00
parent 8cef541b5a
commit 8a967fb17c
5 changed files with 64 additions and 34 deletions

View File

@ -22,6 +22,10 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
MESSAGE("Enabling SSE2 in tests/examples")
ENDIF(TEST_SSE2)
# When TEST_SSE3 is set, append -msse3 to the C++ compiler flags and print a
# status message saying SSE3 is enabled for tests/examples.
IF(TEST_SSE3)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
MESSAGE("Enabling SSE3 in tests/examples")
ENDIF(TEST_SSE3)
IF(TEST_ALTIVEC)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
MESSAGE("Enabling AltiVec in tests/examples")

View File

@ -52,6 +52,9 @@ private:
InnerSize = int(Derived::Flags)&RowMajorBit
? Derived::ColsAtCompileTime
: Derived::RowsAtCompileTime,
InnerMaxSize = int(Derived::Flags)&RowMajorBit
? Derived::MaxColsAtCompileTime
: Derived::MaxRowsAtCompileTime,
PacketSize = ei_packet_traits<typename Derived::Scalar>::size
};
@ -60,7 +63,9 @@ private:
&& ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
MaySliceVectorize = MightVectorize && InnerSize==Dynamic
MaySliceVectorize = MightVectorize && InnerMaxSize==Dynamic /* slice vectorization can be slow, so we only
want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
of a dynamic block in a fixed-size matrix */
};
public:
@ -349,7 +354,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
template<typename Derived1, typename Derived2>
struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling>
{
inline static void run(Derived1 &dst, const Derived2 &src)
static void run(Derived1 &dst, const Derived2 &src)
{
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
@ -383,8 +388,30 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
{
// Slice-vectorized assignment of src into dst.
// Each inner slice (a row when Derived1 is row-major, a column otherwise) is
// copied packet by packet using unaligned packet reads and writes; the
// trailing coefficients that do not fill a whole packet are copied one by one.
static void run(Derived1 &dst, const Derived2 &src)
{
//FIXME unimplemented, so for now we fall back to non-vectorized path
ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>::run(dst, src);
// number of scalars per packet for dst's scalar type
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
// storage order of the destination decides which dimension is the inner one
const bool rowMajor = Derived1::Flags&RowMajorBit;
const int innerSize = rowMajor ? dst.cols() : dst.rows();
const int outerSize = rowMajor ? dst.rows() : dst.cols();
// innerSize rounded down to a multiple of packetSize (integer division)
const int alignedInnerSize = (innerSize/packetSize)*packetSize;
// i walks the outer dimension, i.e. one slice per iteration
for(int i = 0; i < outerSize; i++)
{
// do the vectorizable part of the assignment
for (int index = 0; index<alignedInnerSize ; index+=packetSize)
{
// map (outer i, inner index) to (row, col) according to storage order
const int row = rowMajor ? i : index;
const int col = rowMajor ? index : i;
dst.template writePacket<UnAligned>(row, col, src.template packet<UnAligned>(row, col));
}
// do the non-vectorizable part of the assignment
for (int index = alignedInnerSize; index<innerSize ; index++)
{
const int row = rowMajor ? i : index;
const int col = rowMajor ? index : i;
dst.coeffRef(row, col) = src.coeff(row, col);
}
}
}
};

View File

@ -38,7 +38,7 @@ enum {
template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
struct ei_product_coeff_impl;
// Forward declaration of the packet-path product helper. It is parameterized
// by storage order, Index (a compile-time value or Dynamic in the
// specializations below), operand types, packet type, and the LoadMode that
// the specializations pass to packet<>().
template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar>
template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl;
template<typename T> class ei_product_eval_to_column_major;
@ -188,10 +188,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested> ScalarCoeffImpl;
typedef ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested, PacketScalar> PacketCoeffImpl;
public:
template<typename Lhs, typename Rhs>
@ -232,7 +228,10 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
// Computes the packet of the product located at (row, col) and returns it.
// The work is delegated to ei_product_packet_impl, selected by the storage
// order (RowMajorBit of Flags) and by Index: InnerSize-1 when Unroll is set,
// Dynamic otherwise.
// NOTE(review): LoadMode is not declared in the visible lines; presumably it
// is a template parameter of this method -- confirm against the full file.
const PacketScalar _packet(int row, int col) const
{
PacketScalar res;
PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested, PacketScalar, LoadMode>
::run(row, col, m_lhs, m_rhs, res);
return res;
}
@ -356,63 +355,63 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
*** Packet path ***
*******************/
// Row-major packet product, unrolled at compile time over Index.
// run() first recurses on Index-1, then folds in term Index: the scalar
// lhs.coeff(row, Index) passed through ei_pset1, combined with the rhs packet
// at (Index, col) and the running result via ei_pmadd.
template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar>
template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<Aligned>(Index, col), res);
ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
}
};
// Column-major packet product, unrolled at compile time over Index.
// run() first recurses on Index-1, then folds in term Index: the lhs packet
// at (row, Index) combined with the scalar rhs.coeff(Index, col) (passed
// through ei_pset1) and the running result via ei_pmadd.
template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar>
template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_pmadd(lhs.template packet<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
}
};
// Row-major packet product, recursion base case (Index == 0).
// Initializes res with the first term: ei_pset1 of lhs.coeff(row, 0)
// multiplied (ei_pmul) by the rhs packet at (0, col).
template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar>
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
}
};
// Column-major packet product, recursion base case (Index == 0).
// Initializes res with the first term: the lhs packet at (row, 0) multiplied
// (ei_pmul) by ei_pset1 of rhs.coeff(0, col).
template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar>
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
}
};
// Packet product with Index == Dynamic, for any StorageOrder not covered by a
// more specific specialization. Not unrolled: res starts from the i = 0 term
// and a run-time loop over i = 1 .. lhs.cols()-1 accumulates the rest. Each
// term reads a scalar from lhs (through ei_pset1) and a packet from rhs.
template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar>
template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
{
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(int i = 1; i < lhs.cols(); i++)
res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<Aligned>(i, col), res);
res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
// Column-major packet product with Index == Dynamic. Not unrolled: res starts
// from the i = 0 term and a run-time loop over i = 1 .. lhs.cols()-1
// accumulates the rest. Each term reads a packet from lhs and a scalar from
// rhs (through ei_pset1).
template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar>
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
{
res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
for(int i = 1; i < lhs.cols(); i++)
res = ei_pmadd(lhs.template packet<Aligned>(row, i), ei_pset1(rhs.coeff(i, col)), res);
res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
}
};

View File

@ -94,7 +94,7 @@ inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinter
// ei_pstoreu: store the packet `from` at address `to`, one overload per
// packet type (float, double, int).
// NOTE(review): two int overloads appear below. The first calls the aligned
// _mm_store_si128, the second the unaligned _mm_storeu_si128; per the commit
// message the first is the typo that crashed in the unaligned case.
inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
// ei_pfirst: return a single scalar extracted from packet `a`, using
// _mm_cvtss_f32 for float packets and _mm_cvtsd_f64 for double packets
// (both intrinsics yield the lowest element of the register).
inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }

View File

@ -11,7 +11,7 @@ ENDIF(CMAKE_COMPILER_IS_GNUCXX)
OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)
# similar to SET_TARGET_PROPERTIES but append the property instead of overwritting it
# similar to SET_TARGET_PROPERTIES but append the property instead of overwriting it
MACRO(EI_ADD_TARGET_PROPERTY target prop value)
GET_TARGET_PROPERTY(previous ${target} ${prop})