mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-01-30 17:40:05 +08:00
Added a ei_palign function align a packet from two others.
This allows much faster code dealing with unligned as in the updated matrix-vector product functions.
This commit is contained in:
parent
55aeb1f83a
commit
6d11a07e5e
@ -30,6 +30,7 @@
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <complex>
|
||||
#include <cassert>
|
||||
#include <functional>
|
||||
|
@ -274,7 +274,6 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy
|
||||
|
||||
inline const Scalar coeff(int row, int col) const
|
||||
{
|
||||
// std::cerr << "coeff(int row, int col)\n";
|
||||
if (IsRowMajor)
|
||||
return m_data_ptr[col + row * stride()];
|
||||
else
|
||||
|
@ -390,7 +390,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
|
||||
const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
|
||||
int alignmentPattern = alignmentStep==0 ? AllAligned
|
||||
: alignmentStep==2 ? EvenAligned
|
||||
: alignmentStep==(PacketSize/2) ? EvenAligned
|
||||
: FirstAligned;
|
||||
|
||||
// we cannot assume the first element is aligned because of sub-matrices
|
||||
@ -432,7 +432,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
if (PacketSize>1)
|
||||
{
|
||||
/* explicit vectorization */
|
||||
|
||||
// process initial unaligned coeffs
|
||||
for (int j=0; j<alignedStart; j++)
|
||||
res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
|
||||
@ -452,26 +451,41 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
case FirstAligned:
|
||||
if(peels>1)
|
||||
{
|
||||
// NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient
|
||||
// than the following code
|
||||
asm("#mybegin");
|
||||
Packet A00, A01, A02, A03, A10, A11, A12, A13;
|
||||
if (alignmentStep==1)
|
||||
{
|
||||
A00 = ptmp1; ptmp1 = ptmp3; ptmp3 = A00;
|
||||
const Scalar* aux = lhs1;
|
||||
lhs1 = lhs3; lhs3 = aux;
|
||||
}
|
||||
|
||||
A01 = ei_pload(&lhs1[alignedStart-1]);
|
||||
A02 = ei_pload(&lhs2[alignedStart-2]);
|
||||
A03 = ei_pload(&lhs3[alignedStart-3]);
|
||||
|
||||
for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
|
||||
{
|
||||
A01 = ei_ploadu(&lhs1[j]); A11 = ei_ploadu(&lhs1[j+PacketSize]);
|
||||
A02 = ei_ploadu(&lhs2[j]); A12 = ei_ploadu(&lhs2[j+PacketSize]);
|
||||
A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]);
|
||||
A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
|
||||
A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
|
||||
A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
|
||||
|
||||
A00 = ei_pload (&lhs0[j]);
|
||||
A10 = ei_pload (&lhs0[j+PacketSize]);
|
||||
A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
|
||||
A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
|
||||
|
||||
A00 = ei_pmadd(ptmp1, A01, A00); A10 = ei_pmadd(ptmp1, A11, A10);
|
||||
A03 = ei_ploadu(&lhs3[j]); A13 = ei_ploadu(&lhs3[j+PacketSize]);
|
||||
A00 = ei_pmadd(ptmp2, A02, A00); A10 = ei_pmadd(ptmp2, A12, A10);
|
||||
A00 = ei_pmadd(ptmp3, A03, A00); A10 = ei_pmadd(ptmp3, A13, A10);
|
||||
ei_pstore(&res[j],A00); ei_pstore(&res[j+PacketSize],A10);
|
||||
A00 = ei_pmadd(ptmp1, A01, A00);
|
||||
A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
|
||||
A00 = ei_pmadd(ptmp2, A02, A00);
|
||||
A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
|
||||
A00 = ei_pmadd(ptmp3, A03, A00);
|
||||
ei_pstore(&res[j],A00);
|
||||
A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
|
||||
A10 = ei_pmadd(ptmp1, A11, A10);
|
||||
A10 = ei_pmadd(ptmp2, A12, A10);
|
||||
A10 = ei_pmadd(ptmp3, A13, A10);
|
||||
ei_pstore(&res[j+PacketSize],A10);
|
||||
}
|
||||
asm("#myend");
|
||||
}
|
||||
for (int j = peeledSize; j<alignedSize; j+=PacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(,u,u,);
|
||||
@ -532,7 +546,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
#undef _EIGEN_ACCUMULATE_PACKETS
|
||||
}
|
||||
|
||||
|
||||
// TODO add peeling to mask unaligned load/stores
|
||||
template<typename Scalar, typename ResType>
|
||||
EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
|
||||
@ -570,7 +583,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
|
||||
|
||||
const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
|
||||
int alignmentPattern = alignmentStep==0 ? AllAligned
|
||||
: alignmentStep==2 ? EvenAligned
|
||||
: alignmentStep==(PacketSize/2) ? EvenAligned
|
||||
: FirstAligned;
|
||||
|
||||
// we cannot assume the first element is aligned because of sub-matrices
|
||||
@ -636,27 +649,51 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
|
||||
case FirstAligned:
|
||||
if (peels>1)
|
||||
{
|
||||
Packet A01, A02, A03, b;
|
||||
/* Here we proccess 4 rows with with two peeled iterations to hide
|
||||
* tghe overhead of unaligned loads. Moreover unaligned loads are handled
|
||||
* using special shift/move operations between the two aligned packets
|
||||
* overlaping the desired unaligned packet. This is *much* more efficient
|
||||
* than basic unaligned loads.
|
||||
*/
|
||||
Packet A01, A02, A03, b, A11, A12, A13;
|
||||
if (alignmentStep==1)
|
||||
{
|
||||
// flip row #1 and #3
|
||||
b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
|
||||
const Scalar* aux = lhs1;
|
||||
lhs1 = lhs3; lhs3 = aux;
|
||||
}
|
||||
A01 = ei_pload(&lhs1[alignedStart-1]);
|
||||
A02 = ei_pload(&lhs2[alignedStart-2]);
|
||||
A03 = ei_pload(&lhs3[alignedStart-3]);
|
||||
|
||||
for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
|
||||
{
|
||||
b = ei_pload(&rhs[j]);
|
||||
A01 = ei_ploadu(&lhs1[j]);
|
||||
A02 = ei_ploadu(&lhs2[j]);
|
||||
A03 = ei_ploadu(&lhs3[j]);
|
||||
A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
|
||||
A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
|
||||
A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
|
||||
|
||||
ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
|
||||
ptmp1 = ei_pmadd(b, A01, ptmp1);
|
||||
A01 = ei_ploadu(&lhs1[j+PacketSize]);
|
||||
A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
|
||||
ptmp2 = ei_pmadd(b, A02, ptmp2);
|
||||
A02 = ei_ploadu(&lhs2[j+PacketSize]);
|
||||
A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
|
||||
ptmp3 = ei_pmadd(b, A03, ptmp3);
|
||||
A03 = ei_ploadu(&lhs3[j+PacketSize]);
|
||||
A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
|
||||
|
||||
b = ei_pload(&rhs[j+PacketSize]);
|
||||
ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
|
||||
ptmp1 = ei_pmadd(b, A01, ptmp1);
|
||||
ptmp2 = ei_pmadd(b, A02, ptmp2);
|
||||
ptmp3 = ei_pmadd(b, A03, ptmp3);
|
||||
ptmp1 = ei_pmadd(b, A11, ptmp1);
|
||||
ptmp2 = ei_pmadd(b, A12, ptmp2);
|
||||
ptmp3 = ei_pmadd(b, A13, ptmp3);
|
||||
}
|
||||
if (alignmentStep==1)
|
||||
{
|
||||
// restore rows #1 and #3
|
||||
b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
|
||||
const Scalar* aux = lhs1;
|
||||
lhs1 = lhs3; lhs3 = aux;
|
||||
}
|
||||
}
|
||||
for (int j = peeledSize; j<alignedSize; j+=PacketSize)
|
||||
|
@ -134,5 +134,21 @@ inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
|
||||
: 0;
|
||||
}
|
||||
|
||||
/** \internal specialization of ei_palign() */
|
||||
template<int Offset,typename PacketType>
|
||||
struct ei_palign_impl
|
||||
{
|
||||
// by default data are aligned, so there is nothing to be done :)
|
||||
inline static void run(PacketType& first, const PacketType& second) {}
|
||||
};
|
||||
|
||||
/** \internal update \a first using the concatenation of the \a Offset last elements
|
||||
* of \a first and packet_size minus \a Offset first elements of \a second */
|
||||
template<int Offset,typename PacketType>
|
||||
inline void ei_palign(PacketType& first, const PacketType& second)
|
||||
{
|
||||
ei_palign_impl<Offset,PacketType>::run(first,second);
|
||||
}
|
||||
|
||||
#endif // EIGEN_DUMMY_PACKET_MATH_H
|
||||
|
||||
|
@ -111,8 +111,8 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
|
||||
* This class represents an expression of the product of two matrices.
|
||||
* It is the return type of the operator* between matrices. Its template
|
||||
* arguments are determined automatically by ProductReturnType. Therefore,
|
||||
* Product should be used direclty. To determine the result type of a function
|
||||
* which involve a matrix product, use ProductReturnType::Type.
|
||||
* Product should never be used direclty. To determine the result type of a
|
||||
* function which involves a matrix product, use ProductReturnType::Type.
|
||||
*
|
||||
* \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
|
||||
*/
|
||||
|
@ -90,6 +90,12 @@ template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(fro
|
||||
template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
|
||||
|
||||
template<> inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
|
||||
// template<> inline __m128 ei_ploadu(const float* from) {
|
||||
// if (size_t(from)&0xF)
|
||||
// return _mm_loadu_ps(from);
|
||||
// else
|
||||
// return _mm_loadu_ps(from);
|
||||
// }
|
||||
template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
|
||||
template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
|
||||
|
||||
@ -198,7 +204,80 @@ inline __m128i ei_preduxp(const __m128i* vecs)
|
||||
// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
|
||||
// return res;
|
||||
// }
|
||||
// inline __m128i _mm_alignr_epi8(const __m128i& a, const __m128i& b, const int i)
|
||||
// {
|
||||
// __m128i res = a;
|
||||
// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
|
||||
// return res;
|
||||
// }
|
||||
#endif
|
||||
|
||||
#ifdef __SSSE3__
|
||||
// SSSE3 versions
|
||||
template<int Offset>
|
||||
struct ei_palign_impl<Offset,__m128>
|
||||
{
|
||||
inline static void run(__m128& first, const __m128& second)
|
||||
{
|
||||
first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(first), _mm_castps_si128(second), (4-Offset)*4));
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct ei_palign_impl<Offset,__m128i>
|
||||
{
|
||||
inline static void run(__m128i& first, const __m128i& second)
|
||||
{
|
||||
first = _mm_alignr_epi8(first, second, (4-Offset)*4);
|
||||
}
|
||||
};
|
||||
#else
|
||||
// SSE2 versions
|
||||
template<int Offset>
|
||||
struct ei_palign_impl<Offset,__m128>
|
||||
{
|
||||
inline static void run(__m128& first, const __m128& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm_move_ss(first,second);
|
||||
first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
|
||||
}
|
||||
else if (Offset==2)
|
||||
{
|
||||
first = _mm_movehl_ps(first,first);
|
||||
first = _mm_movelh_ps(first,second);
|
||||
}
|
||||
else if (Offset==3)
|
||||
{
|
||||
first = _mm_move_ss(first,second);
|
||||
first = _mm_shuffle_ps(first,second,0x93);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct ei_palign_impl<Offset,__m128i>
|
||||
{
|
||||
inline static void run(__m128i& first, const __m128i& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
|
||||
first = _mm_shuffle_epi32(first,0x39);
|
||||
}
|
||||
else if (Offset==2)
|
||||
{
|
||||
first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
|
||||
first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
|
||||
}
|
||||
else if (Offset==3)
|
||||
{
|
||||
first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
|
||||
first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_SSE_H
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user