mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-06 19:10:36 +08:00
* implement slice vectorization. Because it uses unaligned packet access, it is not certain that it will bring a performance improvement: benchmarking needed.
* improve logic choosing slice vectorization.
* fix typo in SSE packet math, causing crash in unaligned case.
* fix bug in Product, causing crash in unaligned case.
* add TEST_SSE3 CMake option.
This commit is contained in:
parent
8cef541b5a
commit
8a967fb17c
@ -22,6 +22,10 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
|
||||
MESSAGE("Enabling SSE2 in tests/examples")
|
||||
ENDIF(TEST_SSE2)
|
||||
IF(TEST_SSE3)
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
|
||||
MESSAGE("Enabling SSE3 in tests/examples")
|
||||
ENDIF(TEST_SSE3)
|
||||
IF(TEST_ALTIVEC)
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
|
||||
MESSAGE("Enabling AltiVec in tests/examples")
|
||||
|
@ -52,6 +52,9 @@ private:
|
||||
InnerSize = int(Derived::Flags)&RowMajorBit
|
||||
? Derived::ColsAtCompileTime
|
||||
: Derived::RowsAtCompileTime,
|
||||
InnerMaxSize = int(Derived::Flags)&RowMajorBit
|
||||
? Derived::MaxColsAtCompileTime
|
||||
: Derived::MaxRowsAtCompileTime,
|
||||
PacketSize = ei_packet_traits<typename Derived::Scalar>::size
|
||||
};
|
||||
|
||||
@ -60,7 +63,9 @@ private:
|
||||
&& ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
|
||||
MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
|
||||
MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
|
||||
MaySliceVectorize = MightVectorize && InnerSize==Dynamic
|
||||
MaySliceVectorize = MightVectorize && InnerMaxSize==Dynamic /* slice vectorization can be slow, so we only
|
||||
want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
|
||||
of a dynamic block in a fixed-size matrix */
|
||||
};
|
||||
|
||||
public:
|
||||
@ -349,7 +354,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
|
||||
template<typename Derived1, typename Derived2>
|
||||
struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling>
|
||||
{
|
||||
inline static void run(Derived1 &dst, const Derived2 &src)
|
||||
static void run(Derived1 &dst, const Derived2 &src)
|
||||
{
|
||||
const int size = Derived1::SizeAtCompileTime;
|
||||
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
|
||||
@ -383,8 +388,30 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
|
||||
{
|
||||
static void run(Derived1 &dst, const Derived2 &src)
|
||||
{
|
||||
//FIXME unimplemented, so for now we fall back to non-vectorized path
|
||||
ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>::run(dst, src);
|
||||
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
|
||||
const bool rowMajor = Derived1::Flags&RowMajorBit;
|
||||
const int innerSize = rowMajor ? dst.cols() : dst.rows();
|
||||
const int outerSize = rowMajor ? dst.rows() : dst.cols();
|
||||
const int alignedInnerSize = (innerSize/packetSize)*packetSize;
|
||||
|
||||
for(int i = 0; i < outerSize; i++)
|
||||
{
|
||||
// do the vectorizable part of the assignment
|
||||
for (int index = 0; index<alignedInnerSize ; index+=packetSize)
|
||||
{
|
||||
const int row = rowMajor ? i : index;
|
||||
const int col = rowMajor ? index : i;
|
||||
dst.template writePacket<UnAligned>(row, col, src.template packet<UnAligned>(row, col));
|
||||
}
|
||||
|
||||
// do the non-vectorizable part of the assignment
|
||||
for (int index = alignedInnerSize; index<innerSize ; index++)
|
||||
{
|
||||
const int row = rowMajor ? i : index;
|
||||
const int col = rowMajor ? index : i;
|
||||
dst.coeffRef(row, col) = src.coeff(row, col);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -38,7 +38,7 @@ enum {
|
||||
template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
|
||||
struct ei_product_coeff_impl;
|
||||
|
||||
template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar>
|
||||
template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl;
|
||||
|
||||
template<typename T> class ei_product_eval_to_column_major;
|
||||
@ -188,10 +188,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
||||
Unroll ? InnerSize-1 : Dynamic,
|
||||
_LhsNested, _RhsNested> ScalarCoeffImpl;
|
||||
|
||||
typedef ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
|
||||
Unroll ? InnerSize-1 : Dynamic,
|
||||
_LhsNested, _RhsNested, PacketScalar> PacketCoeffImpl;
|
||||
|
||||
public:
|
||||
|
||||
template<typename Lhs, typename Rhs>
|
||||
@ -232,7 +228,10 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
||||
const PacketScalar _packet(int row, int col) const
|
||||
{
|
||||
PacketScalar res;
|
||||
PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
|
||||
ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
|
||||
Unroll ? InnerSize-1 : Dynamic,
|
||||
_LhsNested, _RhsNested, PacketScalar, LoadMode>
|
||||
::run(row, col, m_lhs, m_rhs, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -356,63 +355,63 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
|
||||
*** Packet path ***
|
||||
*******************/
|
||||
|
||||
template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
|
||||
struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar>
|
||||
template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
|
||||
{
|
||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||
{
|
||||
ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
|
||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<Aligned>(Index, col), res);
|
||||
ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
|
||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
|
||||
struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar>
|
||||
template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
|
||||
{
|
||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||
{
|
||||
ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
|
||||
res = ei_pmadd(lhs.template packet<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
|
||||
ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
|
||||
res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Lhs, typename Rhs, typename PacketScalar>
|
||||
struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar>
|
||||
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
|
||||
{
|
||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||
{
|
||||
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
|
||||
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Lhs, typename Rhs, typename PacketScalar>
|
||||
struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar>
|
||||
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
|
||||
{
|
||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||
{
|
||||
res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
||||
res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
||||
}
|
||||
};
|
||||
|
||||
template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar>
|
||||
struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar>
|
||||
template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
|
||||
{
|
||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
|
||||
{
|
||||
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
|
||||
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
|
||||
for(int i = 1; i < lhs.cols(); i++)
|
||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<Aligned>(i, col), res);
|
||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Lhs, typename Rhs, typename PacketScalar>
|
||||
struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar>
|
||||
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||
struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
|
||||
{
|
||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
|
||||
{
|
||||
res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
||||
res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
||||
for(int i = 1; i < lhs.cols(); i++)
|
||||
res = ei_pmadd(lhs.template packet<Aligned>(row, i), ei_pset1(rhs.coeff(i, col)), res);
|
||||
res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -94,7 +94,7 @@ inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinter
|
||||
|
||||
inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
|
||||
inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
|
||||
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||
|
||||
inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
|
||||
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
|
||||
|
@ -11,7 +11,7 @@ ENDIF(CMAKE_COMPILER_IS_GNUCXX)
|
||||
|
||||
OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)
|
||||
|
||||
# similar to SET_TARGET_PROPERTIES but append the property instead of overwritting it
|
||||
# similar to SET_TARGET_PROPERTIES but append the property instead of overwriting it
|
||||
MACRO(EI_ADD_TARGET_PROPERTY target prop value)
|
||||
|
||||
GET_TARGET_PROPERTY(previous ${target} ${prop})
|
||||
|
Loading…
x
Reference in New Issue
Block a user