mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-15 07:10:37 +08:00
performance improvement: rewrite of the matrix-matrix product following
Goto's paper => x1.4 speedup with more consistent perf results
This commit is contained in:
parent
8ed186b9ab
commit
ed134a0ce5
@ -69,6 +69,286 @@ static void ei_cache_friendly_product(
|
||||
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
|
||||
|
||||
|
||||
#ifndef EIGEN_USE_ALT_PRODUCT
|
||||
|
||||
enum {
|
||||
PacketSize = sizeof(PacketType)/sizeof(Scalar),
|
||||
#if (defined __i386__)
|
||||
HalfRegisterCount = 4,
|
||||
#else
|
||||
HalfRegisterCount = 8,
|
||||
#endif
|
||||
|
||||
// register block size along the N direction
|
||||
nr = HalfRegisterCount/2,
|
||||
|
||||
// register block size along the M direction
|
||||
mr = 2 * PacketSize,
|
||||
|
||||
// max cache block size along the K direction
|
||||
Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width,
|
||||
|
||||
// max cache block size along the M direction
|
||||
Max_mc = 2*Max_kc
|
||||
};
|
||||
|
||||
int kc = std::min<int>(Max_kc,depth); // cache block size along the K direction
|
||||
int mc = std::min<int>(Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize);
|
||||
|
||||
// number of columns which can be processed by packet of nr columns
|
||||
int packet_cols = (cols/nr)*nr;
|
||||
|
||||
// GEMM_VAR1
|
||||
for(int k2=0; k2<depth; k2+=kc)
|
||||
{
|
||||
const int actual_kc = std::min(k2+kc,depth)-k2;
|
||||
|
||||
// we have selected one row panel of rhs and one column panel of lhs
|
||||
// pack rhs's panel into a sequential chunk of memory
|
||||
// and expand each coeff to a constant packet for further reuse
|
||||
{
|
||||
int count = 0;
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
|
||||
const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
|
||||
const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
|
||||
const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
{
|
||||
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
|
||||
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
|
||||
if (nr==4)
|
||||
{
|
||||
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
|
||||
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
|
||||
}
|
||||
count += nr*PacketSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// => GEPP_VAR1
|
||||
for(int i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i2+mc,rows)-i2;
|
||||
|
||||
// We have selected a mc x kc block of lhs
|
||||
// Let's pack it in a clever order for further purely sequential access
|
||||
int count = 0;
|
||||
const int peeled_mc = (actual_mc/mr)*mr;
|
||||
if (lhsRowMajor)
|
||||
{
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
for(int w=0; w<mr; w++)
|
||||
blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
|
||||
for(int i=peeled_mc; i<actual_mc; i++)
|
||||
{
|
||||
const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
blockA[count++] = llhs[k];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
for(int w=0; w<mr; w++)
|
||||
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
|
||||
for(int i=peeled_mc; i<actual_mc; i++)
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
|
||||
}
|
||||
|
||||
// GEBP
|
||||
// loops on each cache friendly block of the result/rhs
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
// loops on each register blocking of lhs/res
|
||||
const int peeled_mc = (actual_mc/mr)*mr;
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*actual_kc];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch(&blA[0], _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// TODO move the res loads to the stores
|
||||
|
||||
// gets res block as register
|
||||
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
|
||||
C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
|
||||
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
|
||||
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
|
||||
C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
|
||||
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
|
||||
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
|
||||
|
||||
// performs "inner" product
|
||||
// TODO let's check wether the flowing peeled loop could not be
|
||||
// optimized via optimal prefetching from one loop to the other
|
||||
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
|
||||
const int peeled_kc = (actual_kc/4)*4;
|
||||
for(int k=0; k<peeled_kc; k+=4)
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
C0 = ei_pmadd(B0, A0, C0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
C4 = ei_pmadd(B0, A1, C4);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||
C1 = ei_pmadd(B1, A0, C1);
|
||||
C5 = ei_pmadd(B1, A1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||
if(nr==4) C2 = ei_pmadd(B2, A0, C2);
|
||||
if(nr==4) C6 = ei_pmadd(B2, A1, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||
if(nr==4) C3 = ei_pmadd(B3, A0, C3);
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
if(nr==4) C7 = ei_pmadd(B3, A1, C7);
|
||||
A1 = ei_pload(&blA[3*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||
C0 = ei_pmadd(B0, A0, C0);
|
||||
C4 = ei_pmadd(B0, A1, C4);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||
C1 = ei_pmadd(B1, A0, C1);
|
||||
C5 = ei_pmadd(B1, A1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||
if(nr==4) C2 = ei_pmadd(B2, A0, C2);
|
||||
if(nr==4) C6 = ei_pmadd(B2, A1, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||
if(nr==4) C3 = ei_pmadd(B3, A0, C3);
|
||||
A0 = ei_pload(&blA[4*PacketSize]);
|
||||
if(nr==4) C7 = ei_pmadd(B3, A1, C7);
|
||||
A1 = ei_pload(&blA[5*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||
|
||||
C0 = ei_pmadd(B0, A0, C0);
|
||||
C4 = ei_pmadd(B0, A1, C4);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||
C1 = ei_pmadd(B1, A0, C1);
|
||||
C5 = ei_pmadd(B1, A1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||
if(nr==4) C2 = ei_pmadd(B2, A0, C2);
|
||||
if(nr==4) C6 = ei_pmadd(B2, A1, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||
if(nr==4) C3 = ei_pmadd(B3, A0, C3);
|
||||
A0 = ei_pload(&blA[6*PacketSize]);
|
||||
if(nr==4) C7 = ei_pmadd(B3, A1, C7);
|
||||
A1 = ei_pload(&blA[7*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||
C0 = ei_pmadd(B0, A0, C0);
|
||||
C4 = ei_pmadd(B0, A1, C4);
|
||||
C1 = ei_pmadd(B1, A0, C1);
|
||||
C5 = ei_pmadd(B1, A1, C5);
|
||||
if(nr==4) C2 = ei_pmadd(B2, A0, C2);
|
||||
if(nr==4) C6 = ei_pmadd(B2, A1, C6);
|
||||
if(nr==4) C3 = ei_pmadd(B3, A0, C3);
|
||||
if(nr==4) C7 = ei_pmadd(B3, A1, C7);
|
||||
|
||||
blB += 4*nr*PacketSize;
|
||||
blA += 4*mr;
|
||||
}
|
||||
for(int k=peeled_kc; k<actual_kc; k++)
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
C0 = ei_pmadd(B0, A0, C0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
C4 = ei_pmadd(B0, A1, C4);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
C1 = ei_pmadd(B1, A0, C1);
|
||||
C5 = ei_pmadd(B1, A1, C5);
|
||||
if(nr==4) C2 = ei_pmadd(B2, A0, C2);
|
||||
if(nr==4) C6 = ei_pmadd(B2, A1, C6);
|
||||
if(nr==4) C3 = ei_pmadd(B3, A0, C3);
|
||||
if(nr==4) C7 = ei_pmadd(B3, A1, C7);
|
||||
|
||||
blB += nr*PacketSize;
|
||||
blA += mr;
|
||||
}
|
||||
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
|
||||
}
|
||||
for(int i=peeled_mc; i<actual_mc; i++)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*actual_kc];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch(&blA[0], _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// gets a 1 x nr res block as registers
|
||||
Scalar C0(0), C1(0), C2(0), C3(0);
|
||||
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
{
|
||||
Scalar B0, B1, B2, B3, A0;
|
||||
|
||||
A0 = blA[k];
|
||||
B0 = blB[0*PacketSize];
|
||||
B1 = blB[1*PacketSize];
|
||||
C0 += B0 * A0;
|
||||
if(nr==4) B2 = blB[2*PacketSize];
|
||||
if(nr==4) B3 = blB[3*PacketSize];
|
||||
C1 += B1 * A0;
|
||||
if(nr==4) C2 += B2 * A0;
|
||||
if(nr==4) C3 += B3 * A0;
|
||||
|
||||
blB += nr*PacketSize;
|
||||
}
|
||||
res[(j2+0)*resStride + i2 + i] += C0;
|
||||
res[(j2+1)*resStride + i2 + i] += C1;
|
||||
if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
|
||||
if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
|
||||
}
|
||||
}
|
||||
// remaining rhs/res columns (<nr)
|
||||
for(int j2=packet_cols; j2<cols; j2++)
|
||||
{
|
||||
for(int i=0; i<actual_mc; i++)
|
||||
{
|
||||
Scalar c0 = res[(j2)*resStride + i2+i];
|
||||
if (lhsRowMajor)
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
c0 += lhs[(k2+k)+(i2+i)*lhsStride] * rhs[j2*rhsStride + k2 + k];
|
||||
else
|
||||
for(int k=0; k<actual_kc; k++)
|
||||
c0 += lhs[(k2+k)*lhsStride + i2+i] * rhs[j2*rhsStride + k2 + k];
|
||||
res[(j2)*resStride + i2+i] = c0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
|
||||
|
||||
#else // alternate product from cylmor
|
||||
|
||||
enum {
|
||||
PacketSize = sizeof(PacketType)/sizeof(Scalar),
|
||||
#if (defined __i386__)
|
||||
@ -83,284 +363,41 @@ static void ei_cache_friendly_product(
|
||||
// maximal size of the blocks fitted in L2 cache
|
||||
MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
|
||||
};
|
||||
|
||||
|
||||
const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
|
||||
|
||||
const int remainingSize = depth % PacketSize;
|
||||
const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
|
||||
const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
|
||||
const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
|
||||
const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
|
||||
|
||||
const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512;
|
||||
const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128;
|
||||
const int l2BlockSize = MaxL2BlockSize > size ? size : 256;
|
||||
const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
|
||||
const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
|
||||
Scalar* EIGEN_RESTRICT block = 0;
|
||||
const int allocBlockSize = l2BlockRows*size;
|
||||
block = ei_aligned_stack_new(Scalar, allocBlockSize);
|
||||
Scalar* EIGEN_RESTRICT rhsCopy
|
||||
= ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
|
||||
|
||||
#ifndef EIGEN_USE_NEW_PRODUCT
|
||||
// loops on each L2 cache friendly blocks of the result
|
||||
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
|
||||
{
|
||||
const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
|
||||
const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
|
||||
const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
|
||||
//const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
|
||||
|
||||
// build a cache friendly blocky matrix
|
||||
int count = 0;
|
||||
|
||||
// copy l2blocksize rows of m_lhs to blocks of ps x bw
|
||||
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
|
||||
{
|
||||
const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
|
||||
|
||||
for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
|
||||
{
|
||||
// TODO merge the "if l2blockRemainingRows" using something like:
|
||||
// const int blockRows = std::min(i+MaxBlockRows, rows) - i;
|
||||
|
||||
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||
{
|
||||
// TODO write these loops using meta unrolling
|
||||
// negligible for large matrices but useful for small ones
|
||||
if (lhsRowMajor)
|
||||
{
|
||||
for (int w=0; w<MaxBlockRows; ++w)
|
||||
for (int s=0; s<PacketSize; ++s)
|
||||
block[count++] = lhs[(i+w)*lhsStride + (k+s)];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int w=0; w<MaxBlockRows; ++w)
|
||||
for (int s=0; s<PacketSize; ++s)
|
||||
block[count++] = lhs[(i+w) + (k+s)*lhsStride];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (l2blockRemainingRows>0)
|
||||
{
|
||||
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||
{
|
||||
if (lhsRowMajor)
|
||||
{
|
||||
for (int w=0; w<l2blockRemainingRows; ++w)
|
||||
for (int s=0; s<PacketSize; ++s)
|
||||
block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int w=0; w<l2blockRemainingRows; ++w)
|
||||
for (int s=0; s<PacketSize; ++s)
|
||||
block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
||||
{
|
||||
int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
|
||||
|
||||
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
|
||||
{
|
||||
// acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
|
||||
int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
|
||||
|
||||
// if not aligned, copy the rhs block
|
||||
if (needRhsCopy)
|
||||
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
||||
{
|
||||
ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
|
||||
memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
|
||||
}
|
||||
|
||||
// for each bw x 1 result's block
|
||||
for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
|
||||
{
|
||||
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
|
||||
const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
|
||||
|
||||
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
||||
{
|
||||
const Scalar* EIGEN_RESTRICT rhsColumn;
|
||||
if (needRhsCopy)
|
||||
rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
|
||||
else
|
||||
rhsColumn = &(rhs[l1j*rhsStride]);
|
||||
|
||||
PacketType dst[MaxBlockRows];
|
||||
dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
|
||||
if (MaxBlockRows==8)
|
||||
dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
|
||||
|
||||
PacketType tmp;
|
||||
|
||||
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||
{
|
||||
tmp = ei_ploadu(&rhsColumn[k]);
|
||||
PacketType A0, A1, A2, A3, A4, A5;
|
||||
A0 = ei_pload(localB + k*MaxBlockRows);
|
||||
A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
|
||||
A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
|
||||
A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
|
||||
if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
|
||||
if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
|
||||
dst[0] = ei_pmadd(tmp, A0, dst[0]);
|
||||
if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
|
||||
dst[1] = ei_pmadd(tmp, A1, dst[1]);
|
||||
if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
|
||||
dst[2] = ei_pmadd(tmp, A2, dst[2]);
|
||||
dst[3] = ei_pmadd(tmp, A3, dst[3]);
|
||||
if (MaxBlockRows==8)
|
||||
{
|
||||
dst[4] = ei_pmadd(tmp, A4, dst[4]);
|
||||
dst[5] = ei_pmadd(tmp, A5, dst[5]);
|
||||
dst[6] = ei_pmadd(tmp, A0, dst[6]);
|
||||
dst[7] = ei_pmadd(tmp, A1, dst[7]);
|
||||
}
|
||||
}
|
||||
|
||||
Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
|
||||
|
||||
if (PacketSize>1 && resIsAligned)
|
||||
{
|
||||
// the result is aligned: let's do packet reduction
|
||||
ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
|
||||
if (PacketSize==2)
|
||||
ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
|
||||
if (MaxBlockRows==8)
|
||||
{
|
||||
ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
|
||||
if (PacketSize==2)
|
||||
ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// not aligned => per coeff packet reduction
|
||||
localRes[0] += ei_predux(dst[0]);
|
||||
localRes[1] += ei_predux(dst[1]);
|
||||
localRes[2] += ei_predux(dst[2]);
|
||||
localRes[3] += ei_predux(dst[3]);
|
||||
if (MaxBlockRows==8)
|
||||
{
|
||||
localRes[4] += ei_predux(dst[4]);
|
||||
localRes[5] += ei_predux(dst[5]);
|
||||
localRes[6] += ei_predux(dst[6]);
|
||||
localRes[7] += ei_predux(dst[7]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (l2blockRemainingRows>0)
|
||||
{
|
||||
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
|
||||
const Scalar* localB = &block[offsetblock];
|
||||
|
||||
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
||||
{
|
||||
const Scalar* EIGEN_RESTRICT rhsColumn;
|
||||
if (needRhsCopy)
|
||||
rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
|
||||
else
|
||||
rhsColumn = &(rhs[l1j*rhsStride]);
|
||||
|
||||
PacketType dst[MaxBlockRows];
|
||||
dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
|
||||
if (MaxBlockRows==8)
|
||||
dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
|
||||
|
||||
// let's declare a few other temporary registers
|
||||
PacketType tmp;
|
||||
|
||||
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||
{
|
||||
tmp = ei_pload(&rhsColumn[k]);
|
||||
|
||||
dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]);
|
||||
if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
|
||||
if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
|
||||
if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
|
||||
if (MaxBlockRows==8)
|
||||
{
|
||||
if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
|
||||
if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
|
||||
if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
|
||||
if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
|
||||
}
|
||||
}
|
||||
|
||||
Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
|
||||
|
||||
// process the remaining rows once at a time
|
||||
localRes[0] += ei_predux(dst[0]);
|
||||
if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
|
||||
if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
|
||||
if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
|
||||
if (MaxBlockRows==8)
|
||||
{
|
||||
if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
|
||||
if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
|
||||
if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
|
||||
if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (PacketSize>1 && remainingSize)
|
||||
{
|
||||
if (lhsRowMajor)
|
||||
{
|
||||
for (int j=0; j<cols; ++j)
|
||||
for (int i=0; i<rows; ++i)
|
||||
{
|
||||
Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
|
||||
// FIXME this loop get vectorized by the compiler !
|
||||
for (int k=1; k<remainingSize; ++k)
|
||||
tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
|
||||
res[i+j*resStride] += tmp;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int j=0; j<cols; ++j)
|
||||
for (int i=0; i<rows; ++i)
|
||||
{
|
||||
Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
|
||||
for (int k=1; k<remainingSize; ++k)
|
||||
tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
|
||||
res[i+j*resStride] += tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
// loops on each L2 cache friendly blocks of the result
|
||||
|
||||
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
||||
|
||||
Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size];
|
||||
// for(int i=0; i<l2BlockRows*l2BlockSize; ++i)
|
||||
// block[i] = 0;
|
||||
// loops on each L2 cache friendly blocks of lhs
|
||||
for(int l2k=0; l2k<depth; l2k+=l2BlockSize)
|
||||
{
|
||||
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
|
||||
{
|
||||
// We have selected a block of lhs
|
||||
// Packs this block into 'block'
|
||||
int count = 0;
|
||||
for(int j=0; j<l2BlockCols; j+=MaxBlockRows)
|
||||
for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
|
||||
{
|
||||
for(int i=0; i<l2BlockRows; i+=2*PacketSize)
|
||||
for (int w=0; w<MaxBlockRows; ++w)
|
||||
for (int y=0; y<2*PacketSize; ++y)
|
||||
block[count++] = lhs[(j+l2j+w)*rows + l2i+i+ y];
|
||||
block[count++] = lhs[(k+l2k+w)*lhsStride + l2i+i+ y];
|
||||
}
|
||||
|
||||
// loops on each L2 cache firendly block of the result/rhs
|
||||
for(int l2k=0; l2k<cols; l2k+=l2BlockCols)
|
||||
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
||||
{
|
||||
for(int i=0; i<l2BlockRows; i+=MaxBlockRows)
|
||||
for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
|
||||
{
|
||||
for(int j=0; j<l2BlockCols; ++j)
|
||||
{
|
||||
@ -370,7 +407,7 @@ static void ei_cache_friendly_product(
|
||||
|
||||
// Here we need some vector reordering
|
||||
// Right now its hardcoded to packets of 4 elements
|
||||
const Scalar* lrhs = &rhs[(j+l2k)*rows+(i+l2j)];
|
||||
const Scalar* lrhs = &rhs[(j+l2j)*rhsStride+(k+l2k)];
|
||||
A0 = ei_pset1(lrhs[0]);
|
||||
A1 = ei_pset1(lrhs[1]);
|
||||
A2 = ei_pset1(lrhs[2]);
|
||||
@ -383,16 +420,17 @@ static void ei_cache_friendly_product(
|
||||
A7 = ei_pset1(lrhs[7]);
|
||||
}
|
||||
|
||||
Scalar * lb = &block[l2BlockRows * i];
|
||||
for(int k=0; k<l2BlockRows; k+=2*PacketSize)
|
||||
Scalar * lb = &block[l2BlockRows * k];
|
||||
for(int i=0; i<l2BlockRows; i+=2*PacketSize)
|
||||
{
|
||||
PacketType R0, R1, L0, L1, T0, T1;
|
||||
asm("#begin sgemm");
|
||||
// asm(".byte 0x66;");
|
||||
|
||||
// We perform "cross products" of vectors to avoid
|
||||
// reductions (horizontal ops) afterwards
|
||||
T0 = ei_pload(&res[(j+l2k)*rows+l2i+k]);
|
||||
T1 = ei_pload(&res[(j+l2k)*rows+l2i+k+PacketSize]);
|
||||
T0 = ei_pload(&res[(j+l2j)*resStride+l2i+i]);
|
||||
T1 = ei_pload(&res[(j+l2j)*resStride+l2i+i+PacketSize]);
|
||||
// uncomment to remove res cache miss
|
||||
// T0 = ei_pload(&res[k]);
|
||||
// T1 = ei_pload(&res[k+PacketSize]);
|
||||
@ -437,11 +475,11 @@ static void ei_cache_friendly_product(
|
||||
}
|
||||
lb += MaxBlockRows*2*PacketSize;
|
||||
|
||||
ei_pstore(&res[(j+l2k)*rows+l2i+k], T0);
|
||||
ei_pstore(&res[(j+l2k)*rows+l2i+k+PacketSize], T1);
|
||||
ei_pstore(&res[(j+l2j)*resStride+l2i+i], T0);
|
||||
ei_pstore(&res[(j+l2j)*resStride+l2i+i+PacketSize], T1);
|
||||
// uncomment to remove res cache miss
|
||||
// ei_pstore(&res[k], T0);
|
||||
// ei_pstore(&res[k+PacketSize], T1);
|
||||
// ei_pstore(&res[0], T0);
|
||||
// ei_pstore(&res[4/*k+PacketSize*/], T1);
|
||||
asm("#end sgemm");
|
||||
}
|
||||
}
|
||||
@ -449,10 +487,10 @@ static void ei_cache_friendly_product(
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] block;
|
||||
#endif
|
||||
|
||||
ei_aligned_stack_delete(Scalar, block, allocBlockSize);
|
||||
ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
|
||||
|
||||
}
|
||||
|
||||
#endif // EIGEN_EXTERN_INSTANTIATIONS
|
||||
|
Loading…
Reference in New Issue
Block a user