mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-07 18:27:40 +08:00
New gebp kernel handling up to 3 packets x 4 register-level blocks. Huge speeup on Haswell.
This changeset also introduce new vector functions: ploadquad and predux4.
This commit is contained in:
parent
feaf7c7e6d
commit
d5a795f673
@ -161,14 +161,6 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
|
||||
/** \internal \returns a packet with elements of \a *from duplicated.
|
||||
* For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
|
||||
* duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
|
||||
* Currently, this function is only used for scalar * complex products.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
|
||||
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
|
||||
@ -177,6 +169,24 @@ pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
|
||||
|
||||
/** \internal \returns a packet with elements of \a *from duplicated.
|
||||
* For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
|
||||
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
|
||||
* Currently, this function is only used for scalar * complex products.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
|
||||
/** \internal \returns a packet with elements of \a *from quadrupled.
|
||||
* For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
|
||||
* replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
|
||||
* Currently, this function is only used in matrix products.
|
||||
* For packet-size smaller or equal to 4, this function is equivalent to pload1
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ploadquad(const typename unpacket_traits<Packet>::type* from)
|
||||
{ return pload1<Packet>(from); }
|
||||
|
||||
/** \internal equivalent to
|
||||
* \code
|
||||
* a0 = pload1(a+0);
|
||||
@ -249,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the sum of the elements of \a a by block of 4 elements.
|
||||
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
|
||||
* For packet-size smaller or equal to 4, this boils down to a noop.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline
|
||||
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
|
||||
predux4(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the product of the elements of \a a*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
|
||||
{ return a; }
|
||||
|
@ -45,7 +45,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
|
||||
@ -271,7 +271,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; };
|
||||
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
|
||||
|
@ -83,9 +83,9 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
};
|
||||
*/
|
||||
|
||||
template<> struct unpacket_traits<Packet8f> { typedef float type; enum {size=8}; };
|
||||
template<> struct unpacket_traits<Packet4d> { typedef double type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet8i> { typedef int type; enum {size=8}; };
|
||||
template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; };
|
||||
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
|
||||
@ -141,7 +141,16 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
|
||||
return _mm256_fmadd_ps(a,b,c);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
// see above
|
||||
Packet4d res = c;
|
||||
asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
|
||||
return res;
|
||||
#else
|
||||
return _mm256_fmadd_pd(a,b,c);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
|
||||
@ -189,6 +198,13 @@ template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
|
||||
return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12);
|
||||
}
|
||||
|
||||
// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
|
||||
{
|
||||
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
|
||||
return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
|
||||
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
|
||||
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
|
||||
@ -345,6 +361,11 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
|
||||
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
|
||||
{
|
||||
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
|
||||
{
|
||||
Packet8f tmp;
|
||||
|
@ -99,8 +99,8 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
|
||||
/*
|
||||
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
|
||||
{
|
||||
|
@ -47,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
|
||||
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
|
||||
{
|
||||
|
@ -101,8 +101,8 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
|
||||
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
|
||||
#endif
|
||||
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
|
||||
|
@ -49,7 +49,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
};
|
||||
#endif
|
||||
|
||||
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
|
||||
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
|
||||
@ -296,7 +296,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
};
|
||||
#endif
|
||||
|
||||
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
|
||||
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
|
||||
|
@ -107,9 +107,9 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
|
||||
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER==1500)
|
||||
// Workaround MSVC 9 internal compiler error.
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -23,6 +23,8 @@ template<
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
||||
{
|
||||
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
|
||||
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(
|
||||
Index rows, Index cols, Index depth,
|
||||
@ -51,6 +53,8 @@ template<
|
||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
|
||||
{
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static void run(Index rows, Index cols, Index depth,
|
||||
const LhsScalar* _lhs, Index lhsStride,
|
||||
@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth,
|
||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
//Index nc = blocking.nc(); // cache block size along the N direction
|
||||
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
@ -80,66 +82,68 @@ static void run(Index rows, Index cols, Index depth,
|
||||
Index tid = omp_get_thread_num();
|
||||
Index threads = omp_get_num_threads();
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
|
||||
LhsScalar* blockA = blocking.blockA();
|
||||
eigen_internal_assert(blockA!=0);
|
||||
|
||||
RhsScalar* blockB = blocking.blockB();
|
||||
eigen_internal_assert(blockB!=0);
|
||||
|
||||
std::size_t sizeB = kc*nc;
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
||||
for(Index k=0; k<depth; k+=kc)
|
||||
{
|
||||
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
|
||||
|
||||
// In order to reduce the chance that a thread has to wait for the other,
|
||||
// let's start by packing A'.
|
||||
pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
|
||||
// let's start by packing B'.
|
||||
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
|
||||
|
||||
// Pack B_k to B' in a parallel fashion:
|
||||
// each thread packs the sub block B_k,j to B'_j where j is the thread id.
|
||||
// Pack A_k to A' in a parallel fashion:
|
||||
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
|
||||
|
||||
// However, before copying to B'_j, we have to make sure that no other thread is still using it,
|
||||
// However, before copying to A'_i, we have to make sure that no other thread is still using it,
|
||||
// i.e., we test that info[tid].users equals 0.
|
||||
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
|
||||
while(info[tid].users!=0) {}
|
||||
info[tid].users += threads;
|
||||
|
||||
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
|
||||
|
||||
pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
|
||||
|
||||
// Notify the other threads that the part B'_j is ready to go.
|
||||
// Notify the other threads that the part A'_i is ready to go.
|
||||
info[tid].sync = k;
|
||||
|
||||
// Computes C_i += A' * B' per B'_j
|
||||
|
||||
// Computes C_i += A' * B' per A'_i
|
||||
for(Index shift=0; shift<threads; ++shift)
|
||||
{
|
||||
Index j = (tid+shift)%threads;
|
||||
Index i = (tid+shift)%threads;
|
||||
|
||||
// At this point we have to make sure that B'_j has been updated by the thread j,
|
||||
// At this point we have to make sure that A'_i has been updated by the thread i,
|
||||
// we use testAndSetOrdered to mimic a volatile access.
|
||||
// However, no need to wait for the B' part which has been updated by the current thread!
|
||||
if(shift>0)
|
||||
while(info[j].sync!=k) {}
|
||||
|
||||
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0);
|
||||
while(info[i].sync!=k) {}
|
||||
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
|
||||
}
|
||||
|
||||
// Then keep going as usual with the remaining A'
|
||||
for(Index i=mc; i<rows; i+=mc)
|
||||
// Then keep going as usual with the remaining B'
|
||||
for(Index j=nc; j<cols; j+=nc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i+mc,rows)-i;
|
||||
const Index actual_nc = (std::min)(j+nc,cols)-j;
|
||||
|
||||
// pack A_i,k to A'
|
||||
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
|
||||
// pack B_k,j to B'
|
||||
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
|
||||
|
||||
// C_i += A' * B'
|
||||
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0);
|
||||
// C_j += A' * B'
|
||||
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
}
|
||||
|
||||
// Release all the sub blocks B'_j of B' for the current thread,
|
||||
// Release all the sub blocks A'_i of A' for the current thread,
|
||||
// i.e., we simply decrement the number of users by 1
|
||||
for(Index j=0; j<threads; ++j)
|
||||
#pragma omp critical
|
||||
{
|
||||
for(Index i=0; i<threads; ++i)
|
||||
#pragma omp atomic
|
||||
--(info[j].users);
|
||||
--(info[i].users);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -149,36 +153,34 @@ static void run(Index rows, Index cols, Index depth,
|
||||
|
||||
// this is the sequential version!
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
std::size_t sizeB = kc*nc;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
// (==GEMM_VAR1)
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
{
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
|
||||
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
||||
// => Pack rhs's panel into a sequential chunk of memory (L2 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the lhs's
|
||||
// vertical panel which is, in practice, a very low number.
|
||||
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
|
||||
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
||||
// horizontal panel which is, in practice, a very low number.
|
||||
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
|
||||
|
||||
// For each mc x kc block of the lhs's vertical panel...
|
||||
// (==GEPP_VAR1)
|
||||
for(Index i2=0; i2<rows; i2+=mc)
|
||||
// For each kc x nc block of the rhs's horizontal panel...
|
||||
for(Index j2=0; j2<cols; j2+=nc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
|
||||
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
||||
|
||||
// We pack the lhs's block into a sequential chunk of memory (L1 caching)
|
||||
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||
// micro vertical panel of the large rhs's panel (e.g., cols/4 times).
|
||||
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
|
||||
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
||||
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
|
||||
|
||||
// Everything is packed, we can now call the block * panel kernel:
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
// Everything is packed, we can now call the panel * block kernel:
|
||||
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -199,14 +201,13 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
|
||||
template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
|
||||
struct gemm_functor
|
||||
{
|
||||
gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
|
||||
BlockingType& blocking)
|
||||
gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
|
||||
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
|
||||
{}
|
||||
|
||||
void initParallelSession() const
|
||||
{
|
||||
m_blocking.allocateB();
|
||||
m_blocking.allocateA();
|
||||
}
|
||||
|
||||
void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
|
||||
@ -220,6 +221,8 @@ struct gemm_functor
|
||||
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
|
||||
m_actualAlpha, m_blocking, info);
|
||||
}
|
||||
|
||||
typedef typename Gemm::Traits Traits;
|
||||
|
||||
protected:
|
||||
const Lhs& m_lhs;
|
||||
@ -316,13 +319,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
||||
|
||||
public:
|
||||
|
||||
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
|
||||
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
|
||||
{
|
||||
this->m_mc = Transpose ? cols : rows;
|
||||
this->m_nc = Transpose ? rows : cols;
|
||||
this->m_kc = depth;
|
||||
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
|
||||
if(full_rows)
|
||||
{
|
||||
DenseIndex m = this->m_mc;
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
|
||||
}
|
||||
else // full columns
|
||||
{
|
||||
DenseIndex n = this->m_nc;
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
|
||||
}
|
||||
|
||||
m_sizeA = this->m_mc * this->m_kc;
|
||||
m_sizeB = this->m_kc * this->m_nc;
|
||||
}
|
||||
@ -396,7 +409,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
|
||||
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
|
||||
_ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
|
||||
|
||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
|
||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
|
||||
|
||||
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
|
||||
}
|
||||
|
@ -73,13 +73,13 @@ namespace internal {
|
||||
|
||||
template<typename Index> struct GemmParallelInfo
|
||||
{
|
||||
GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
|
||||
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
|
||||
|
||||
int volatile sync;
|
||||
int volatile users;
|
||||
|
||||
Index rhs_start;
|
||||
Index rhs_length;
|
||||
Index lhs_start;
|
||||
Index lhs_length;
|
||||
};
|
||||
|
||||
template<bool Condition, typename Functor, typename Index>
|
||||
@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
||||
if((!Condition) || (omp_get_num_threads()>1))
|
||||
return func(0,rows, 0,cols);
|
||||
|
||||
Index size = transpose ? cols : rows;
|
||||
Index size = transpose ? rows : cols;
|
||||
|
||||
// 2- compute the maximal number of threads from the size of the product:
|
||||
// FIXME this has to be fine tuned
|
||||
@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
||||
std::swap(rows,cols);
|
||||
|
||||
Index blockCols = (cols / threads) & ~Index(0x3);
|
||||
Index blockRows = (rows / threads) & ~Index(0x7);
|
||||
Index blockRows = (rows / threads);
|
||||
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
|
||||
|
||||
GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
|
||||
|
||||
#pragma omp parallel for schedule(static,1) num_threads(threads)
|
||||
for(Index i=0; i<threads; ++i)
|
||||
#pragma omp parallel num_threads(threads)
|
||||
{
|
||||
Index i = omp_get_thread_num();
|
||||
Index r0 = i*blockRows;
|
||||
Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
|
||||
|
||||
Index c0 = i*blockCols;
|
||||
Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
|
||||
|
||||
info[i].rhs_start = c0;
|
||||
info[i].rhs_length = actualBlockCols;
|
||||
info[i].lhs_start = r0;
|
||||
info[i].lhs_length = actualBlockRows;
|
||||
|
||||
if(transpose)
|
||||
func(0, cols, r0, actualBlockRows, info);
|
||||
else
|
||||
func(r0, actualBlockRows, 0,cols, info);
|
||||
if(transpose) func(c0, actualBlockCols, 0, rows, info);
|
||||
else func(0, rows, c0, actualBlockCols, info);
|
||||
}
|
||||
|
||||
delete[] info;
|
||||
|
@ -15,7 +15,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// pack a selfadjoint block diagonal for use with the gebp_kernel
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
|
||||
struct symm_pack_lhs
|
||||
{
|
||||
template<int BlockRows> inline
|
||||
@ -45,22 +45,29 @@ struct symm_pack_lhs
|
||||
}
|
||||
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
|
||||
{
|
||||
enum { PacketSize = packet_traits<Scalar>::size };
|
||||
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
|
||||
Index count = 0;
|
||||
Index peeled_mc = (rows/Pack1)*Pack1;
|
||||
for(Index i=0; i<peeled_mc; i+=Pack1)
|
||||
{
|
||||
pack<Pack1>(blockA, lhs, cols, i, count);
|
||||
}
|
||||
|
||||
if(rows-peeled_mc>=Pack2)
|
||||
{
|
||||
pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
|
||||
peeled_mc += Pack2;
|
||||
}
|
||||
//Index peeled_mc3 = (rows/Pack1)*Pack1;
|
||||
|
||||
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
||||
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
||||
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
||||
|
||||
if(Pack1>=3*PacketSize)
|
||||
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
|
||||
pack<3*PacketSize>(blockA, lhs, cols, i, count);
|
||||
|
||||
if(Pack1>=2*PacketSize)
|
||||
for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
|
||||
pack<2*PacketSize>(blockA, lhs, cols, i, count);
|
||||
|
||||
if(Pack1>=1*PacketSize)
|
||||
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
|
||||
pack<1*PacketSize>(blockA, lhs, cols, i, count);
|
||||
|
||||
// do the same with mr==1
|
||||
for(Index i=peeled_mc; i<rows; i++)
|
||||
for(Index i=peeled_mc1; i<rows; i++)
|
||||
{
|
||||
for(Index k=0; k<i; k++)
|
||||
blockA[count++] = lhs(i, k); // normal
|
||||
|
Loading…
Reference in New Issue
Block a user