mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-02-05 17:50:26 +08:00

some cleaning

This commit is contained in: parent 6076173f0b, commit c6d06c22ac
@@ -78,9 +78,6 @@ static void run(int rows, int cols, int depth,
   Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

-  // number of columns which can be processed by packet of nr columns
-  int packet_cols = (cols/Blocking::nr) * Blocking::nr;
-
   // => GEMM_VAR1
   for(int k2=0; k2<depth; k2+=kc)
   {
@@ -89,7 +86,7 @@ static void run(int rows, int cols, int depth,
     // we have selected one row panel of rhs and one column panel of lhs
     // pack rhs's panel into a sequential chunk of memory
     // and expand each coeff to a constant packet for further reuse
-    ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
+    ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

     // => GEPP_VAR1
     for(int i2=0; i2<rows; i2+=mc)
@@ -99,7 +96,7 @@ static void run(int rows, int cols, int depth,
       ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);

       ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
-        (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+        (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
     }
   }

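Aside (editor's sketch, not part of the commit): the hunks above are the classic cache-blocked GEMM loop nest. A minimal scalar rendition of that structure, with illustrative block sizes (kc=256, mc=64 are assumptions, not Eigen's tuned values) and the packing steps reduced to comments:

#include <algorithm>

// assumed column-major layouts: lhs is rows x depth (stride lhsStride),
// rhs is depth x cols (stride rhsStride), res is rows x cols (stride resStride)
void gemm_blocked_sketch(const double* lhs, int lhsStride,
                         const double* rhs, int rhsStride,
                         double* res, int resStride,
                         int rows, int cols, int depth, double alpha)
{
  const int kc = 256, mc = 64;            // illustrative cache block sizes
  for(int k2 = 0; k2 < depth; k2 += kc)   // => GEMM_VAR1: slice the depth
  {
    const int actual_kc = std::min(k2 + kc, depth) - k2;
    // (here Eigen packs the kc x cols rhs panel into blockB, folding alpha in)
    for(int i2 = 0; i2 < rows; i2 += mc)  // => GEPP_VAR1: slice the rows
    {
      const int actual_mc = std::min(i2 + mc, rows) - i2;
      // (here Eigen packs the mc x kc lhs panel into blockA)
      // kernel: res(i2:i2+actual_mc, :) += alpha * lhsPanel * rhsPanel
      for(int j = 0; j < cols; ++j)
        for(int i = 0; i < actual_mc; ++i)
        {
          double c = 0;
          for(int k = 0; k < actual_kc; ++k)
            c += lhs[(i2+i) + (k2+k)*lhsStride] * rhs[(k2+k) + j*rhsStride];
          res[(i2+i) + j*resStride] += alpha * c;
        }
    }
  }
}

The point of the cleanup is visible here: the kernel only ever needs the panel sizes and the column count, and packet_cols can be derived locally, so it disappears from the interfaces.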
@@ -113,19 +110,20 @@ static void run(int rows, int cols, int depth,
 template<typename Scalar, int mr, int nr, typename Conj>
 struct ei_gebp_kernel
 {
-  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols)
+  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols)
   {
     typedef typename ei_packet_traits<Scalar>::type PacketType;
     enum { PacketSize = ei_packet_traits<Scalar>::size };
     Conj cj;
-    const int peeled_mc = (actual_mc/mr)*mr;
+    int packet_cols = (cols/nr) * nr;
+    const int peeled_mc = (rows/mr)*mr;
     // loops on each cache friendly block of the result/rhs
     for(int j2=0; j2<packet_cols; j2+=nr)
     {
       // loops on each register blocking of lhs/res
       for(int i=0; i<peeled_mc; i+=mr)
       {
-        const Scalar* blA = &blockA[i*actual_kc];
+        const Scalar* blA = &blockA[i*depth];
 #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
 #endif
@@ -134,20 +132,20 @@ struct ei_gebp_kernel

         // gets res block as register
         PacketType C0, C1, C2, C3, C4, C5, C6, C7;
-        C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
-        C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
-        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
-        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
-        C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
-        C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
-        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
-        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
+        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
+        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
+        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
+        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
+        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
+        C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
+        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
+        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);

         // performs "inner" product
         // TODO let's check whether the following peeled loop could not be
         // optimized via optimal prefetching from one loop to the other
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        const int peeled_kc = (actual_kc/4)*4;
+        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        const int peeled_kc = (depth/4)*4;
         for(int k=0; k<peeled_kc; k+=4)
         {
           PacketType B0, B1, B2, B3, A0, A1;
@@ -214,7 +212,7 @@ struct ei_gebp_kernel
           blA += 4*mr;
         }
         // process remaining peeled loop
-        for(int k=peeled_kc; k<actual_kc; k++)
+        for(int k=peeled_kc; k<depth; k++)
         {
           PacketType B0, B1, B2, B3, A0, A1;

@@ -237,26 +235,26 @@ struct ei_gebp_kernel
           blA += mr;
         }

-        ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
-        ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
-        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
-        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
-        ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
-        ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
-        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
-        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
+        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
+        ei_pstoreu(&res[(j2+1)*resStride + i], C1);
+        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
+        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
+        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
+        ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
+        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
+        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
       }
-      for(int i=peeled_mc; i<actual_mc; i++)
+      for(int i=peeled_mc; i<rows; i++)
       {
-        const Scalar* blA = &blockA[i*actual_kc];
+        const Scalar* blA = &blockA[i*depth];
 #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
 #endif

         // gets a 1 x nr res block as registers
         Scalar C0(0), C1(0), C2(0), C3(0);
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        for(int k=0; k<actual_kc; k++)
+        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        for(int k=0; k<depth; k++)
         {
           Scalar B0, B1, B2, B3, A0;

@@ -272,10 +270,10 @@ struct ei_gebp_kernel

           blB += nr*PacketSize;
         }
-        res[(j2+0)*resStride + i2 + i] += C0;
-        res[(j2+1)*resStride + i2 + i] += C1;
-        if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
-        if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
+        res[(j2+0)*resStride + i] += C0;
+        res[(j2+1)*resStride + i] += C1;
+        if(nr==4) res[(j2+2)*resStride + i] += C2;
+        if(nr==4) res[(j2+3)*resStride + i] += C3;
       }
     }

@@ -285,7 +283,7 @@ struct ei_gebp_kernel
     {
       for(int i=0; i<peeled_mc; i+=mr)
       {
-        const Scalar* blA = &blockA[i*actual_kc];
+        const Scalar* blA = &blockA[i*depth];
 #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
 #endif
@@ -294,11 +292,11 @@ struct ei_gebp_kernel

         // gets res block as register
         PacketType C0, C4;
-        C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
-        C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
+        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
+        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);

-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        for(int k=0; k<actual_kc; k++)
+        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        for(int k=0; k<depth; k++)
         {
           PacketType B0, A0, A1;

@@ -312,22 +310,22 @@ struct ei_gebp_kernel
           blA += mr;
         }

-        ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
-        ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
+        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
+        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
       }
-      for(int i=peeled_mc; i<actual_mc; i++)
+      for(int i=peeled_mc; i<rows; i++)
       {
-        const Scalar* blA = &blockA[i*actual_kc];
+        const Scalar* blA = &blockA[i*depth];
 #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
 #endif

         // gets a 1 x 1 res block as registers
         Scalar C0(0);
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        for(int k=0; k<actual_kc; k++)
+        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        for(int k=0; k<depth; k++)
           C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
-        res[(j2+0)*resStride + i2 + i] += C0;
+        res[(j2+0)*resStride + i] += C0;
       }
     }
   }
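Aside (editor's sketch, not part of the commit): the kernel above keeps each mr x nr tile of res in registers for the whole depth loop; that is the essence of register blocking. A hedged plain-scalar miniature of that accumulation pattern, with mr=nr=2, no packets, and invented names:

// blA holds mr interleaved lhs rows (mr entries per k), blB holds nr
// interleaved rhs columns (nr entries per k); see the packing routines below.
template<typename Scalar>
void gebp_tile_sketch(Scalar* res, int resStride,
                      const Scalar* blA, const Scalar* blB,
                      int depth, int i, int j2)
{
  enum { mr = 2, nr = 2 };                 // illustrative register block
  Scalar C00(0), C01(0), C10(0), C11(0);   // the tile lives in registers
  for(int k = 0; k < depth; ++k)
  {
    const Scalar A0 = blA[k*mr + 0], A1 = blA[k*mr + 1];
    const Scalar B0 = blB[k*nr + 0], B1 = blB[k*nr + 1];
    C00 += A0*B0; C10 += A1*B0;            // what cj.pmadd does with packets
    C01 += A0*B1; C11 += A1*B1;
  }
  res[(j2+0)*resStride + i+0] += C00;      // res touched once per tile
  res[(j2+0)*resStride + i+1] += C10;
  res[(j2+1)*resStride + i+0] += C01;
  res[(j2+1)*resStride + i+1] += C11;
}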
@@ -350,19 +348,19 @@ struct ei_gebp_kernel
 template<typename Scalar, int mr, int StorageOrder, bool Conjugate>
 struct ei_gemm_pack_lhs
 {
-  void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
+  void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
   {
     ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
     ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
     int count = 0;
-    const int peeled_mc = (actual_mc/mr)*mr;
+    const int peeled_mc = (rows/mr)*mr;
     for(int i=0; i<peeled_mc; i+=mr)
-      for(int k=0; k<actual_kc; k++)
+      for(int k=0; k<depth; k++)
         for(int w=0; w<mr; w++)
           blockA[count++] = cj(lhs(i+w, k));
-    for(int i=peeled_mc; i<actual_mc; i++)
+    for(int i=peeled_mc; i<rows; i++)
     {
-      for(int k=0; k<actual_kc; k++)
+      for(int k=0; k<depth; k++)
         blockA[count++] = cj(lhs(i, k));
     }
   }
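Aside (editor's sketch, not part of the commit): the packing above interleaves mr rows so the kernel can read its mr lhs entries for each k contiguously. The same traversal without Eigen's data mapper and conjugation helper, assuming a plain column-major lhs:

template<typename Scalar, int mr>
void pack_lhs_sketch(Scalar* blockA, const Scalar* lhs, int lhsStride,
                     int depth, int rows)
{
  int count = 0;
  const int peeled_mc = (rows/mr)*mr;
  for(int i = 0; i < peeled_mc; i += mr)   // full strips of mr rows
    for(int k = 0; k < depth; ++k)
      for(int w = 0; w < mr; ++w)
        blockA[count++] = lhs[(i+w) + k*lhsStride]; // column-major access
  for(int i = peeled_mc; i < rows; ++i)    // leftover rows, one at a time
    for(int k = 0; k < depth; ++k)
      blockA[count++] = lhs[i + k*lhsStride];
}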
@@ -379,9 +377,10 @@ template<typename Scalar, int nr>
 struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
 {
   enum { PacketSize = ei_packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
+  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
   {
     bool hasAlpha = alpha != Scalar(1);
+    int packet_cols = (cols/nr) * nr;
     int count = 0;
     for(int j2=0; j2<packet_cols; j2+=nr)
     {
@@ -391,7 +390,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
       if (hasAlpha)
       {
-        for(int k=0; k<actual_kc; k++)
+        for(int k=0; k<depth; k++)
         {
           ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
           ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
@@ -405,7 +404,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
       }
       else
       {
-        for(int k=0; k<actual_kc; k++)
+        for(int k=0; k<depth; k++)
         {
           ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
           ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
@@ -424,7 +423,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
       if (hasAlpha)
       {
-        for(int k=0; k<actual_kc; k++)
+        for(int k=0; k<depth; k++)
         {
           ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
           count += PacketSize;
@@ -432,7 +431,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor>
       }
       else
       {
-        for(int k=0; k<actual_kc; k++)
+        for(int k=0; k<depth; k++)
         {
           ei_pstore(&blockB[count], ei_pset1(b0[k]));
           count += PacketSize;
@@ -447,15 +446,16 @@ template<typename Scalar, int nr>
 struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
 {
   enum { PacketSize = ei_packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
+  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols)
   {
     bool hasAlpha = alpha != Scalar(1);
+    int packet_cols = (cols/nr) * nr;
     int count = 0;
     for(int j2=0; j2<packet_cols; j2+=nr)
     {
       if (hasAlpha)
       {
-        for(int k=0; k<actual_kc; k++)
+        for(int k=0; k<depth; k++)
         {
           const Scalar* b0 = &rhs[k*rhsStride + j2];
           ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
@@ -470,7 +470,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
       }
       else
       {
-        for(int k=0; k<actual_kc; k++)
+        for(int k=0; k<depth; k++)
         {
           const Scalar* b0 = &rhs[k*rhsStride + j2];
           ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
@@ -488,7 +488,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor>
     for(int j2=packet_cols; j2<cols; ++j2)
     {
       const Scalar* b0 = &rhs[j2];
-      for(int k=0; k<actual_kc; k++)
+      for(int k=0; k<depth; k++)
       {
         ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
         count += PacketSize;
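Aside (editor's sketch, not part of the commit): ei_gemm_pack_rhs does two things at once: it expands every rhs coefficient into PacketSize identical copies (what ei_pset1 produces, so the kernel can ei_pload a ready-made constant packet), and it folds alpha in once instead of in the kernel. A scalar stand-in for the ColMajor case; unlike the real code, which branches on hasAlpha, alpha is folded unconditionally here:

template<typename Scalar, int nr, int PacketSize>
void pack_rhs_sketch(Scalar* blockB, const Scalar* rhs, int rhsStride,
                     Scalar alpha, int depth, int cols)
{
  const int packet_cols = (cols/nr)*nr;    // columns handled nr at a time
  int count = 0;
  for(int j2 = 0; j2 < packet_cols; j2 += nr)
    for(int k = 0; k < depth; ++k)
      for(int j = 0; j < nr; ++j)
      {
        const Scalar v = alpha * rhs[k + (j2+j)*rhsStride];
        for(int p = 0; p < PacketSize; ++p) // ei_pset1: replicate the coeff
          blockB[count++] = v;
      }
  for(int j2 = packet_cols; j2 < cols; ++j2) // leftover columns, nr==1 case
    for(int k = 0; k < depth; ++k)
    {
      const Scalar v = alpha * rhs[k + j2*rhsStride];
      for(int p = 0; p < PacketSize; ++p)
        blockB[count++] = v;
    }
}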
@@ -29,11 +29,11 @@
 template<typename Scalar, int mr, int StorageOrder>
 struct ei_symm_pack_lhs
 {
-  void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
+  void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int cols, int rows)
   {
     ei_const_blas_data_mapper<Scalar,StorageOrder> lhs(_lhs,lhsStride);
     int count = 0;
-    const int peeled_mc = (actual_mc/mr)*mr;
+    const int peeled_mc = (rows/mr)*mr;
     for(int i=0; i<peeled_mc; i+=mr)
     {
       // normal copy
@@ -51,17 +51,17 @@ struct ei_symm_pack_lhs
         ++h;
       }
       // transposed copy
-      for(int k=i+mr; k<actual_kc; k++)
+      for(int k=i+mr; k<cols; k++)
         for(int w=0; w<mr; w++)
           blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
     }

     // do the same with mr==1
-    for(int i=peeled_mc; i<actual_mc; i++)
+    for(int i=peeled_mc; i<rows; i++)
     {
       for(int k=0; k<=i; k++)
         blockA[count++] = lhs(i, k); // normal
-      for(int k=i+1; k<actual_kc; k++)
+      for(int k=i+1; k<cols; k++)
         blockA[count++] = ei_conj(lhs(k, i)); // transposed
     }
   }
@@ -71,11 +71,12 @@ template<typename Scalar, int nr, int StorageOrder>
 struct ei_symm_pack_rhs
 {
   enum { PacketSize = ei_packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols, int k2)
+  void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int rows, int cols, int k2)
   {
-    int end_k = k2 + actual_kc;
+    int end_k = k2 + rows;
     int count = 0;
     ei_const_blas_data_mapper<Scalar,StorageOrder> rhs(_rhs,rhsStride);
+    int packet_cols = (cols/nr)*nr;

     // first part: normal case
     for(int j2=0; j2<k2; j2+=nr)
@@ -94,7 +95,7 @@ struct ei_symm_pack_rhs
     }

     // second part: diagonal block
-    for(int j2=k2; j2<std::min(k2+actual_kc,packet_cols); j2+=nr)
+    for(int j2=k2; j2<std::min(k2+rows,packet_cols); j2+=nr)
     {
       // again we can split vertically in three different parts (transpose, symmetric, normal)
       // transpose
@@ -137,7 +138,7 @@ struct ei_symm_pack_rhs
     }

     // third part: transposed
-    for(int j2=k2+actual_kc; j2<packet_cols; j2+=nr)
+    for(int j2=k2+rows; j2<packet_cols; j2+=nr)
     {
       for(int k=k2; k<end_k; k++)
       {
@@ -163,7 +164,7 @@ struct ei_symm_pack_rhs
         count += PacketSize;
       }
       // normal
-      for(int k=half; k<k2+actual_kc; k++)
+      for(int k=half; k<k2+rows; k++)
       {
         ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2)));
         count += PacketSize;
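Aside (editor's sketch, not part of the commit): the three "parts" in ei_symm_pack_rhs exist because only one triangle of the symmetric rhs is actually stored, so coefficients on the wrong side of the diagonal must be mirrored (and conjugated, in the complex case). A hedged sketch of the coefficient fetch this packing effectively performs, for a real-valued matrix with lower-triangle storage:

template<typename Scalar>
Scalar symm_coeff_lower_sketch(const Scalar* rhs, int rhsStride, int row, int col)
{
  // on or below the diagonal: stored directly; above: mirror across it
  return row >= col ? rhs[row + col*rhsStride]
                    : rhs[col + row*rhsStride];
}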
@@ -233,9 +234,6 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

-    // number of columns which can be processed by packet of nr columns
-    int packet_cols = (cols/Blocking::nr)*Blocking::nr;
-
     ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;

     for(int k2=0; k2<size; k2+=kc)
@@ -246,7 +244,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
       // pack rhs's panel into a sequential chunk of memory
       // and expand each coeff to a constant packet for further reuse
       ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
-        (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);
+        (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

       // the selected lhs's panel has to be split in three different parts:
       // 1 - the transposed panel above the diagonal block => transposed packed copy
@@ -259,7 +257,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
         ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true>()
           (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);

-        gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
       }
       // the block diagonal
       {
@@ -268,7 +266,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
         ei_symm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
           (blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);

-        gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
+        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
       }

       for(int i2=k2+kc; i2<size; i2+=mc)
@@ -277,7 +275,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
         ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder,false>()
           (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);

-        gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
       }
     }

@@ -316,9 +314,6 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

-    // number of columns which can be processed by packet of nr columns
-    int packet_cols = (cols/Blocking::nr)*Blocking::nr;
-
     ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;

     for(int k2=0; k2<size; k2+=kc)
@@ -326,7 +321,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
       const int actual_kc = std::min(k2+kc,size)-k2;

       ei_symm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
-        (blockB, _rhs, rhsStride, alpha, actual_kc, packet_cols, cols, k2);
+        (blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2);

       // => GEPP
       for(int i2=0; i2<rows; i2+=mc)
@@ -335,7 +330,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
         ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
           (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);

-        gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
       }
     }

@@ -70,7 +70,7 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>

   typedef ei_product_blocking_traits<Scalar> Blocking;

-  int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
+  int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
   int mc = std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction

   Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
@@ -90,7 +90,7 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>

   // note that the actual rhs is the transpose/adjoint of mat
   ei_gemm_pack_rhs<Scalar,Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor>()
-    (blockB, &mat(0,k2), matStride, alpha, actual_kc, packet_cols, size);
+    (blockB, &mat(0,k2), matStride, alpha, actual_kc, size);

   for(int i2=0; i2<size; i2+=mc)
   {
@@ -104,16 +104,15 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>
     // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
     // 3 - after the diagonal => processed with gebp or skipped
     if (UpLo==LowerTriangular)
-      gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, std::min(packet_cols,i2), i2, std::min(size,i2));
+      gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2));

     ei_sybb_kernel<Scalar, Blocking::mr, Blocking::nr, Conj, UpLo>()
-      (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc, std::min(actual_mc,std::max(packet_cols-i2,0)));
+      (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc);

     if (UpLo==UpperTriangular)
     {
       int j2 = i2+actual_mc;
-      gebp_kernel(res+resStride*j2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc,
-                  std::max(0,packet_cols-j2), i2, std::max(0,size-j2));
+      gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2));
     }
   }
 }
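Aside (editor's sketch, not part of the commit): the calls above compute C += alpha * A * A^T while touching only one triangle of C, which is why the gebp call is clipped at the diagonal and the diagonal block is routed through the special sybb kernel. A naive runnable reference of those semantics (lower triangle, column-major, no blocking), useful as a mental model:

template<typename Scalar>
void triangular_rank_update_sketch(Scalar* res, int resStride, int size,
                                   const Scalar* a, int depth, Scalar alpha)
{
  // a is size x depth, column-major with stride == size
  for(int j = 0; j < size; ++j)
    for(int i = j; i < size; ++i)        // lower triangle only
    {
      Scalar c(0);
      for(int k = 0; k < depth; ++k)     // res(i,j) += alpha * (a * a^T)(i,j)
        c += a[i + k*size] * a[j + k*size];
      res[i + j*resStride] += alpha * c;
    }
}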
@@ -149,323 +148,57 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 }

-// optimized SYmmetric packed Block * packed Block product kernel
+// Optimized SYmmetric packed Block * packed Block product kernel.
+// This kernel is built on top of the gebp kernel:
+//  - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize
+//    where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+//  - then, as usual, each panel is split into three parts along the diagonal,
+//    the sub blocks above and below the diagonal are processed as usual,
+//    while the selfadjoint block overlapping the diagonal is evaluated into a
+//    small temporary buffer which is then accumulated into the result using a
+//    triangular traversal.
 template<typename Scalar, int mr, int nr, typename Conj, int UpLo>
 struct ei_sybb_kernel
 {
   enum {
     PacketSize = ei_packet_traits<Scalar>::size,
-    BlockSize = EIGEN_ENUM_MAX(mr,nr)
+    BlockSize = EIGEN_ENUM_MAX(mr,nr)
   };
-  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols)
+  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth)
   {
     ei_gebp_kernel<Scalar, mr, nr, Conj> gebp_kernel;
     Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;

     // let's process the block per panel of actual_mc x BlockSize,
     // again, each is split into three parts, etc.
-    for (int j=0; j<actual_mc; j+=BlockSize)
+    for (int j=0; j<size; j+=BlockSize)
     {
-      int actualBlockSize = std::min(BlockSize,actual_mc - j);
-      int actualPacketCols = std::min(actualBlockSize,std::max(packet_cols-j,0));
-      const Scalar* actual_b = blockB+j*actual_kc*PacketSize;
-
+      int actualBlockSize = std::min<int>(BlockSize,size - j);
+      const Scalar* actual_b = blockB+j*depth*PacketSize;

       if(UpLo==UpperTriangular)
-        gebp_kernel(res+j*resStride, resStride, blockA, actual_b,
-                    j, actual_kc, actualPacketCols, 0, actualBlockSize);
+        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize);

       // selfadjoint micro block
       // => use the gebp kernel on a temporary buffer,
       // and then perform a triangular copy
       {
         int i = j;
         buffer.setZero();
-        gebp_kernel(buffer.data(), BlockSize, blockA+actual_kc*i, actual_b,
-                    actualBlockSize, actual_kc, actualPacketCols, 0, actualBlockSize);
+        // 1 - apply the kernel on the temporary buffer
+        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize);
+        // 2 - triangular accumulation
         for(int j1=0; j1<actualBlockSize; ++j1)
         {
-          Scalar* r = res + (j+j1)*resStride;
+          Scalar* r = res + (j+j1)*resStride + i;
           for(int i1=UpLo==LowerTriangular ? j1 : 0;
-              UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++j1)
-            r[i1+j1*resStride] = buffer(i1,j1);
+              UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++i1)
+            r[i1] += buffer(i1,j1);
         }
       }

       if(UpLo==LowerTriangular)
       {
         int i = j+actualBlockSize;
-        if(i<actual_mc)
-          gebp_kernel(res+j*resStride+i, resStride, blockA+actual_kc*i, actual_b,
-                      actual_mc-i, actual_kc, actualPacketCols, 0, actualBlockSize);
       }
     }
   }
 };
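Aside (editor's sketch, not part of the commit): the "triangular accumulation" step of the rewritten kernel above, isolated as a standalone sketch. The diagonal sub-block is first computed densely into a small buffer, then only the wanted triangle is added back to res (lower-triangular case shown, column-major buffer):

template<typename Scalar>
void triangular_accumulate_sketch(Scalar* res, int resStride,
                                  const Scalar* buffer, int bufStride,
                                  int blockSize)
{
  for(int j1 = 0; j1 < blockSize; ++j1)
  {
    Scalar* r = res + j1*resStride;
    for(int i1 = j1; i1 < blockSize; ++i1)  // discard writes above the diagonal
      r[i1] += buffer[i1 + j1*bufStride];
  }
}

The removed lines that follow are the old standalone ei_sybb_kernel_, whose duplicated gebp logic this commit replaces with the buffer-plus-accumulation scheme; its single surviving call site appears as the added line at the end of the block.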

-// optimized SYmmetric packed Block * packed Block product kernel
-// this kernel is very similar to the gebp kernel: the only differences are
-// the piece of code to avoid the writes off the diagonal
-// => TODO find a way to factorize the two kernels in a single one
-template<typename Scalar, int mr, int nr, typename Conj, int UpLo>
-struct ei_sybb_kernel_
-{
-  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols)
-  {
-    typedef typename ei_packet_traits<Scalar>::type PacketType;
-    enum { PacketSize = ei_packet_traits<Scalar>::size };
-    Conj cj;
-    const int peeled_mc = (actual_mc/mr)*mr;
-    // loops on each cache friendly block of the result/rhs
-    for(int j2=0; j2<packet_cols; j2+=nr)
-    {
-      // here we selected a vertical mc x nr panel of the result that we'll
-      // process normally until the end of the diagonal (or from the start if upper)
-      //
-      int start_i = UpLo==LowerTriangular ? (j2/mr)*mr : 0;
-      int end_i = UpLo==LowerTriangular ? actual_mc : std::min(actual_mc,((j2+std::max(mr,nr))/mr)*mr);
-      for(int i=start_i; i<std::min(peeled_mc,end_i); i+=mr)
-      {
-        const Scalar* blA = &blockA[i*actual_kc];
-#ifdef EIGEN_VECTORIZE_SSE
-        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
-#endif
-
-        // TODO move the res loads to the stores
-
-        // gets res block as register
-        PacketType C0, C1, C2, C3, C4, C5, C6, C7;
-        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
-        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
-        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
-        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
-        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
-        C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
-        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
-        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
-
-        // performs "inner" product
-        // TODO let's check whether the following peeled loop could not be
-        // optimized via optimal prefetching from one loop to the other
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        const int peeled_kc = (actual_kc/4)*4;
-        for(int k=0; k<peeled_kc; k+=4)
-        {
-          PacketType B0, B1, B2, B3, A0, A1;
-
-          A0 = ei_pload(&blA[0*PacketSize]);
-          A1 = ei_pload(&blA[1*PacketSize]);
-          B0 = ei_pload(&blB[0*PacketSize]);
-          B1 = ei_pload(&blB[1*PacketSize]);
-          C0 = cj.pmadd(A0, B0, C0);
-          if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
-          C4 = cj.pmadd(A1, B0, C4);
-          if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
-          B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
-          C1 = cj.pmadd(A0, B1, C1);
-          C5 = cj.pmadd(A1, B1, C5);
-          B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
-          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
-          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
-          if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
-          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-          A0 = ei_pload(&blA[2*PacketSize]);
-          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-          A1 = ei_pload(&blA[3*PacketSize]);
-          if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
-          C0 = cj.pmadd(A0, B0, C0);
-          C4 = cj.pmadd(A1, B0, C4);
-          B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
-          C1 = cj.pmadd(A0, B1, C1);
-          C5 = cj.pmadd(A1, B1, C5);
-          B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
-          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
-          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
-          if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
-          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-          A0 = ei_pload(&blA[4*PacketSize]);
-          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-          A1 = ei_pload(&blA[5*PacketSize]);
-          if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
-          C0 = cj.pmadd(A0, B0, C0);
-          C4 = cj.pmadd(A1, B0, C4);
-          B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
-          C1 = cj.pmadd(A0, B1, C1);
-          C5 = cj.pmadd(A1, B1, C5);
-          B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
-          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
-          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
-          if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
-          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-          A0 = ei_pload(&blA[6*PacketSize]);
-          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-          A1 = ei_pload(&blA[7*PacketSize]);
-          if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
-          C0 = cj.pmadd(A0, B0, C0);
-          C4 = cj.pmadd(A1, B0, C4);
-          C1 = cj.pmadd(A0, B1, C1);
-          C5 = cj.pmadd(A1, B1, C5);
-          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
-          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
-          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-
-          blB += 4*nr*PacketSize;
-          blA += 4*mr;
-        }
-        // process remaining peeled loop
-        for(int k=peeled_kc; k<actual_kc; k++)
-        {
-          PacketType B0, B1, B2, B3, A0, A1;
-
-          A0 = ei_pload(&blA[0*PacketSize]);
-          A1 = ei_pload(&blA[1*PacketSize]);
-          B0 = ei_pload(&blB[0*PacketSize]);
-          B1 = ei_pload(&blB[1*PacketSize]);
-          C0 = cj.pmadd(A0, B0, C0);
-          if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
-          C4 = cj.pmadd(A1, B0, C4);
-          if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
-          C1 = cj.pmadd(A0, B1, C1);
-          C5 = cj.pmadd(A1, B1, C5);
-          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
-          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
-          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
-
-          blB += nr*PacketSize;
-          blA += mr;
-        }
-
-        // let's check whether the mr x nr block overlaps the diagonal,
-        // if so then we have to carefully discard writes off the diagonal
-        if(UpLo==LowerTriangular ? i>=j2+nr : i+mr<=j2)
-        {
-          ei_pstoreu(&res[(j2+0)*resStride + i], C0);
-          ei_pstoreu(&res[(j2+1)*resStride + i], C1);
-          if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
-          if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
-          ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
-          ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
-          if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
-          if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
-        }
-        else
-        {
-          Scalar buf[mr*nr];
-          // overlap => copy to a temporary mr x nr buffer and then triangular copy
-          ei_pstore(&buf[0*mr], C0);
-          ei_pstore(&buf[1*mr], C1);
-          if(nr==4) ei_pstore(&buf[2*mr], C2);
-          if(nr==4) ei_pstore(&buf[3*mr], C3);
-          ei_pstore(&buf[0*mr + PacketSize], C4);
-          ei_pstore(&buf[1*mr + PacketSize], C5);
-          if(nr==4) ei_pstore(&buf[2*mr + PacketSize], C6);
-          if(nr==4) ei_pstore(&buf[3*mr + PacketSize], C7);
-
-          for(int j1=0; j1<nr; ++j1)
-            for(int i1=0; i1<mr; ++i1)
-            {
-              if(UpLo==LowerTriangular ? i+i1 >= j2+j1 : i+i1 <= j2+j1)
-                res[(j2+j1)*resStride + i+i1] = buf[i1 + j1 * mr];
-            }
-        }
-      }
-      for(int i=std::max(start_i,peeled_mc); i<std::min(end_i,actual_mc); i++)
-      {
-        const Scalar* blA = &blockA[i*actual_kc];
-#ifdef EIGEN_VECTORIZE_SSE
-        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
-#endif
-
-        // gets a 1 x nr res block as registers
-        Scalar C0(0), C1(0), C2(0), C3(0);
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        for(int k=0; k<actual_kc; k++)
-        {
-          Scalar B0, B1, B2, B3, A0;
-
-          A0 = blA[k];
-          B0 = blB[0*PacketSize];
-          B1 = blB[1*PacketSize];
-          C0 = cj.pmadd(A0, B0, C0);
-          if(nr==4) B2 = blB[2*PacketSize];
-          if(nr==4) B3 = blB[3*PacketSize];
-          C1 = cj.pmadd(A0, B1, C1);
-          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
-          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
-
-          blB += nr*PacketSize;
-        }
-        if(UpLo==LowerTriangular ? i>=j2+nr : i+mr<=j2) {
-          res[(j2+0)*resStride + i] += C0;
-          res[(j2+1)*resStride + i] += C1;
-          if(nr==4) res[(j2+2)*resStride + i] += C2;
-          if(nr==4) res[(j2+3)*resStride + i] += C3;
-        }
-        else
-        {
-          if(UpLo==LowerTriangular ? i>=j2+0 : i<=j2+0) res[(j2+0)*resStride + i] += C0;
-          if(UpLo==LowerTriangular ? i>=j2+1 : i<=j2+1) res[(j2+1)*resStride + i] += C1;
-          if(nr==4) if(UpLo==LowerTriangular ? i>=j2+2 : i<=j2+2) res[(j2+2)*resStride + i] += C2;
-          if(nr==4) if(UpLo==LowerTriangular ? i>=j2+3 : i<=j2+3) res[(j2+3)*resStride + i] += C3;
-        }
-      }
-    }
-
-    // process remaining rhs/res columns one at a time
-    // => do the same but with nr==1
-    for(int j2=packet_cols; j2<actual_mc; j2++)
-    {
-      int start_i = UpLo==LowerTriangular ? (j2/mr)*mr : 0;
-      int end_i = UpLo==LowerTriangular ? actual_mc : std::min(actual_mc,j2+1);
-      for(int i=start_i; i<std::min(end_i,peeled_mc); i+=mr)
-      {
-        const Scalar* blA = &blockA[i*actual_kc];
-#ifdef EIGEN_VECTORIZE_SSE
-        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
-#endif
-
-        // TODO move the res loads to the stores
-
-        // gets res block as register
-        PacketType C0, C4;
-        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
-        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
-
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        for(int k=0; k<actual_kc; k++)
-        {
-          PacketType B0, A0, A1;
-
-          A0 = ei_pload(&blA[0*PacketSize]);
-          A1 = ei_pload(&blA[1*PacketSize]);
-          B0 = ei_pload(&blB[0*PacketSize]);
-          C0 = cj.pmadd(A0, B0, C0);
-          C4 = cj.pmadd(A1, B0, C4);
-
-          blB += PacketSize;
-          blA += mr;
-        }
-
-        if(UpLo==LowerTriangular ? i>=j2 : i<=j2) ei_pstoreu(&res[(j2+0)*resStride + i], C0);
-        if(UpLo==LowerTriangular ? i+PacketSize>=j2 : i+PacketSize<=j2) ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
-      }
-      if(UpLo==LowerTriangular)
-        start_i = j2;
-      for(int i=std::max(start_i,peeled_mc); i<std::min(end_i,actual_mc); i++)
-      {
-        const Scalar* blA = &blockA[i*actual_kc];
-#ifdef EIGEN_VECTORIZE_SSE
-        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
-#endif
-
-        // gets a 1 x 1 res block as registers
-        Scalar C0(0);
-        const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
-        for(int k=0; k<actual_kc; k++)
-          C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
-        res[(j2+0)*resStride + i] += C0;
+        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize);
       }
     }
   }