* refactoring of the matrix product into multiple small kernels

* started an efficient selfadjoint matrix * general matrix product
  based on the generic kernels (=> requires very few additional lines of code)
Gael Guennebaud 2009-07-21 16:58:35 +02:00
parent afa8f2ca95
commit d6627d540e
4 changed files with 533 additions and 375 deletions
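
The refactoring splits the former monolithic ei_cache_friendly_product into three reusable pieces, ei_gemm_pack_rhs, ei_gemm_pack_lhs and ei_gebp_kernel, so that other products (such as the selfadjoint one added below) only have to swap the packing step. Below is a minimal scalar sketch of that decomposition with simplified, hypothetical signatures (column-major data, C += A*B, no SIMD, no alpha scaling); it is not the actual Eigen code, only an illustration of the classical Goto-style blocking:

#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {

// pack a kc x cols panel of B into contiguous memory, column by column
static void pack_rhs(std::vector<double>& blockB, const double* B, int ldb,
                     int k2, int kc, int cols)
{
  blockB.resize(std::size_t(kc) * cols);
  for (int j = 0; j < cols; ++j)
    for (int k = 0; k < kc; ++k)
      blockB[std::size_t(j) * kc + k] = B[std::size_t(j) * ldb + (k2 + k)];
}

// pack an mc x kc block of A into contiguous memory, row by row
static void pack_lhs(std::vector<double>& blockA, const double* A, int lda,
                     int i2, int k2, int mc, int kc)
{
  blockA.resize(std::size_t(mc) * kc);
  for (int i = 0; i < mc; ++i)
    for (int k = 0; k < kc; ++k)
      blockA[std::size_t(i) * kc + k] = A[std::size_t(k2 + k) * lda + (i2 + i)];
}

// "GEBP": C(i2:i2+mc, :) += packed A block * packed B panel
static void gebp(double* C, int ldc, const std::vector<double>& blockA,
                 const std::vector<double>& blockB, int i2, int mc, int kc, int cols)
{
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < mc; ++i) {
      double acc = 0;
      for (int k = 0; k < kc; ++k)
        acc += blockA[std::size_t(i) * kc + k] * blockB[std::size_t(j) * kc + k];
      C[std::size_t(j) * ldc + (i2 + i)] += acc;
    }
}

// the high-level product only selects cache blocks and delegates to the kernels
static void gemm(int rows, int cols, int depth,
                 const double* A, int lda, const double* B, int ldb,
                 double* C, int ldc)
{
  const int kc = 256, mc = 256; // cache blocking sizes, arbitrary in this sketch
  std::vector<double> blockA, blockB;
  for (int k2 = 0; k2 < depth; k2 += kc) {
    const int actual_kc = std::min(depth - k2, kc);
    pack_rhs(blockB, B, ldb, k2, actual_kc, cols);
    for (int i2 = 0; i2 < rows; i2 += mc) {
      const int actual_mc = std::min(rows - i2, mc);
      pack_lhs(blockA, A, lda, i2, k2, actual_mc, actual_kc);
      gebp(C, ldc, blockA, blockB, i2, actual_mc, actual_kc, cols);
    }
  }
}

} // namespace sketch

The real kernels in the diff differ in that blockB coefficients are expanded to full packets and pre-scaled by alpha during packing, blockA is packed in mr-row slices, and the GEBP inner loop is the hand-unrolled SIMD code shown below.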


@@ -180,6 +180,7 @@ namespace Eigen {
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixMatrix.h"
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
#include "src/Core/SolveTriangular.h"


@@ -25,6 +25,15 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H
template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj>
struct ei_gebp_kernel;
template<typename Scalar, int PacketSize, int nr>
struct ei_gemm_pack_rhs;
template<typename Scalar, int mr>
struct ei_gemm_pack_lhs;
template <int L2MemorySize,typename Scalar>
struct ei_L2_block_traits {
enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
@@ -124,10 +133,6 @@ static void ei_cache_friendly_product(
typedef typename ei_packet_traits<Scalar>::type PacketType;
#ifndef EIGEN_USE_ALT_PRODUCT
enum {
PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
@@ -166,398 +171,346 @@ static void ei_cache_friendly_product(
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
{
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
if (hasAlpha)
{
for(int k=0; k<actual_kc; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
if (nr==4)
{
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
}
count += nr*PacketSize;
}
}
else
{
for(int k=0; k<actual_kc; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
if (nr==4)
{
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
}
count += nr*PacketSize;
}
}
}
}
ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols);
// => GEPP_VAR1
for(int i2=0; i2<rows; i2+=mc)
{
const int actual_mc = std::min(i2+mc,rows)-i2;
// We have selected a mc x kc block of lhs
// Let's pack it in a clever order for further purely sequential access
int count = 0;
const int peeled_mc = (actual_mc/mr)*mr;
if (lhsRowMajor)
{
for(int i=0; i<peeled_mc; i+=mr)
for(int k=0; k<actual_kc; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
for(int i=peeled_mc; i<actual_mc; i++)
{
const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
for(int k=0; k<actual_kc; k++)
blockA[count++] = llhs[k];
}
}
else
{
for(int i=0; i<peeled_mc; i+=mr)
for(int k=0; k<actual_kc; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
for(int i=peeled_mc; i<actual_mc; i++)
for(int k=0; k<actual_kc; k++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
}
ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2);
// GEBP
// loops on each cache friendly block of the result/rhs
for(int j2=0; j2<packet_cols; j2+=nr)
{
// loops on each register blocking of lhs/res
const int peeled_mc = (actual_mc/mr)*mr;
for(int i=0; i<peeled_mc; i+=mr)
{
const Scalar* blA = &blockA[i*actual_kc];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// TODO move the res loads to the stores
// gets res block as register
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
// performs "inner" product
// TODO let's check whether the following peeled loop could not be
// optimized via optimal prefetching from one loop to the other
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
const int peeled_kc = (actual_kc/4)*4;
for(int k=0; k<peeled_kc; k+=4)
{
PacketType B0, B1, B2, B3, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
C4 = cj.pmadd(A1, B0, C4);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[4*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[5*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[6*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[7*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
blB += 4*nr*PacketSize;
blA += 4*mr;
}
// process the remaining (non-peeled) loop iterations
for(int k=peeled_kc; k<actual_kc; k++)
{
PacketType B0, B1, B2, B3, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
C4 = cj.pmadd(A1, B0, C4);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
blB += nr*PacketSize;
blA += mr;
}
ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
}
for(int i=peeled_mc; i<actual_mc; i++)
{
const Scalar* blA = &blockA[i*actual_kc];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// gets a 1 x nr res block as registers
Scalar C0(0), C1(0), C2(0), C3(0);
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
for(int k=0; k<actual_kc; k++)
{
Scalar B0, B1, B2, B3, A0;
A0 = blA[k];
B0 = blB[0*PacketSize];
B1 = blB[1*PacketSize];
C0 = cj.pmadd(A0, B0, C0);
if(nr==4) B2 = blB[2*PacketSize];
if(nr==4) B3 = blB[3*PacketSize];
C1 = cj.pmadd(A0, B1, C1);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
blB += nr*PacketSize;
}
res[(j2+0)*resStride + i2 + i] += C0;
res[(j2+1)*resStride + i2 + i] += C1;
if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
}
}
// remaining rhs/res columns (<nr)
for(int j2=packet_cols; j2<cols; j2++)
{
for(int i=0; i<actual_mc; i++)
{
Scalar c0 = Scalar(0);
if (lhsRowMajor)
for(int k=0; k<actual_kc; k++)
c0 += cj.pmul(lhs[(k2+k)+(i2+i)*lhsStride], rhs[j2*rhsStride + k2 + k]);
else
for(int k=0; k<actual_kc; k++)
c0 += cj.pmul(lhs[(k2+k)*lhsStride + i2+i], rhs[j2*rhsStride + k2 + k]);
res[(j2)*resStride + i2+i] += (ConjugateRhs ? ei_conj(alpha) : alpha) * c0;
}
}
ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
}
}
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
}
#else // alternate product from cylmor
enum {
PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
// i386 architecture provides only 8 xmm registers,
// so let's reduce the max number of rows processed at once.
MaxBlockRows = 4,
MaxBlockRows_ClampingMask = 0xFFFFFC,
#else
MaxBlockRows = 8,
MaxBlockRows_ClampingMask = 0xFFFFF8,
#endif
// maximal size of the blocks fitted in L2 cache
MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
};
const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
const int remainingSize = depth % PacketSize;
const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512;
const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128;
const int l2BlockSize = MaxL2BlockSize > size ? size : 256;
const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size];
// for(int i=0; i<l2BlockRows*l2BlockSize; ++i)
// block[i] = 0;
// loops on each L2 cache friendly blocks of lhs
for(int l2k=0; l2k<depth; l2k+=l2BlockSize)
// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, typename PacketType, int PacketSize, int mr, int nr, typename Conj>
struct ei_gebp_kernel
{
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols)
{
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
Conj cj;
const int peeled_mc = (actual_mc/mr)*mr;
// loops on each cache friendly block of the result/rhs
for(int j2=0; j2<packet_cols; j2+=nr)
{
// We have selected a block of lhs
// Packs this block into 'block'
int count = 0;
for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
// loops on each register blocking of lhs/res
for(int i=0; i<peeled_mc; i+=mr)
{
for(int i=0; i<l2BlockRows; i+=2*PacketSize)
for (int w=0; w<MaxBlockRows; ++w)
for (int y=0; y<2*PacketSize; ++y)
block[count++] = lhs[(k+l2k+w)*lhsStride + l2i+i+ y];
}
const Scalar* blA = &blockA[i*actual_kc];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// loops on each L2 cache friendly block of the result/rhs
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
{
for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
// TODO move the res loads to the stores
// gets res block as register
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
// performs "inner" product
// TODO let's check whether the following peeled loop could not be
// optimized via optimal prefetching from one loop to the other
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
const int peeled_kc = (actual_kc/4)*4;
for(int k=0; k<peeled_kc; k+=4)
{
for(int j=0; j<l2BlockCols; ++j)
PacketType B0, B1, B2, B3, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
C4 = cj.pmadd(A1, B0, C4);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[4*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[5*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
A0 = ei_pload(&blA[6*PacketSize]);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
A1 = ei_pload(&blA[7*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
blB += 4*nr*PacketSize;
blA += 4*mr;
}
// process the remaining (non-peeled) loop iterations
for(int k=peeled_kc; k<actual_kc; k++)
{
PacketType B0, B1, B2, B3, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
C4 = cj.pmadd(A1, B0, C4);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
C1 = cj.pmadd(A0, B1, C1);
C5 = cj.pmadd(A1, B1, C5);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
blB += nr*PacketSize;
blA += mr;
}
ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
}
for(int i=peeled_mc; i<actual_mc; i++)
{
const Scalar* blA = &blockA[i*actual_kc];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// gets a 1 x nr res block as registers
Scalar C0(0), C1(0), C2(0), C3(0);
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
for(int k=0; k<actual_kc; k++)
{
Scalar B0, B1, B2, B3, A0;
A0 = blA[k];
B0 = blB[0*PacketSize];
B1 = blB[1*PacketSize];
C0 = cj.pmadd(A0, B0, C0);
if(nr==4) B2 = blB[2*PacketSize];
if(nr==4) B3 = blB[3*PacketSize];
C1 = cj.pmadd(A0, B1, C1);
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
blB += nr*PacketSize;
}
res[(j2+0)*resStride + i2 + i] += C0;
res[(j2+1)*resStride + i2 + i] += C1;
if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
}
}
// remaining rhs/res columns (<nr)
// process remaining rhs/res columns one at a time
// => do the same but with nr==1
for(int j2=packet_cols; j2<cols; j2++)
{
for(int i=0; i<peeled_mc; i+=mr)
{
const Scalar* blA = &blockA[i*actual_kc];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// TODO move the res loads to the stores
// gets res block as register
PacketType C0, C4;
C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
for(int k=0; k<actual_kc; k++)
{
PacketType B0, A0, A1;
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
C0 = cj.pmadd(A0, B0, C0);
C4 = cj.pmadd(A1, B0, C4);
blB += PacketSize;
blA += mr;
}
ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
}
for(int i=peeled_mc; i<actual_mc; i++)
{
const Scalar* blA = &blockA[i*actual_kc];
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
#endif
// gets a 1 x 1 res block as registers
Scalar C0(0);
const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
for(int k=0; k<actual_kc; k++)
C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
res[(j2+0)*resStride + i2 + i] += C0;
}
}
}
};
// pack a block of the lhs
template<typename Scalar, int mr>
struct ei_gemm_pack_lhs
{
void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2)
{
int count = 0;
const int peeled_mc = (actual_mc/mr)*mr;
if (lhsRowMajor)
{
for(int i=0; i<peeled_mc; i+=mr)
for(int k=0; k<actual_kc; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
for(int i=peeled_mc; i<actual_mc; i++)
{
const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
for(int k=0; k<actual_kc; k++)
blockA[count++] = llhs[k];
}
}
else
{
for(int i=0; i<peeled_mc; i+=mr)
for(int k=0; k<actual_kc; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
for(int i=peeled_mc; i<actual_mc; i++)
for(int k=0; k<actual_kc; k++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
}
}
};
// copy a complete panel of the rhs while expanding each coefficient into a packet form
template<typename Scalar, int PacketSize, int nr>
struct ei_gemm_pack_rhs
{
void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, bool hasAlpha, Scalar alpha, int actual_kc, int packet_cols, int k2, int cols)
{
int count = 0;
for(int j2=0; j2<packet_cols; j2+=nr)
{
const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
if (hasAlpha)
{
for(int k=0; k<actual_kc; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
if (nr==4)
{
PacketType A0, A1, A2, A3, A4, A5, A6, A7;
// Load the packets from rhs and reorder them
// Here we need some vector reordering
// Right now it's hardcoded to packets of 4 elements
const Scalar* lrhs = &rhs[(j+l2j)*rhsStride+(k+l2k)];
A0 = ei_pset1(lrhs[0]);
A1 = ei_pset1(lrhs[1]);
A2 = ei_pset1(lrhs[2]);
A3 = ei_pset1(lrhs[3]);
if (MaxBlockRows==8)
{
A4 = ei_pset1(lrhs[4]);
A5 = ei_pset1(lrhs[5]);
A6 = ei_pset1(lrhs[6]);
A7 = ei_pset1(lrhs[7]);
}
Scalar * lb = &block[l2BlockRows * k];
for(int i=0; i<l2BlockRows; i+=2*PacketSize)
{
PacketType R0, R1, L0, L1, T0, T1;
// We perform "cross products" of vectors to avoid
// reductions (horizontal ops) afterwards
T0 = ei_pload(&res[(j+l2j)*resStride+l2i+i]);
T1 = ei_pload(&res[(j+l2j)*resStride+l2i+i+PacketSize]);
R0 = ei_pload(&lb[0*PacketSize]);
L0 = ei_pload(&lb[1*PacketSize]);
R1 = ei_pload(&lb[2*PacketSize]);
L1 = ei_pload(&lb[3*PacketSize]);
T0 = cj.pmadd(A0, R0, T0);
T1 = cj.pmadd(A0, L0, T1);
R0 = ei_pload(&lb[4*PacketSize]);
L0 = ei_pload(&lb[5*PacketSize]);
T0 = cj.pmadd(A1, R1, T0);
T1 = cj.pmadd(A1, L1, T1);
R1 = ei_pload(&lb[6*PacketSize]);
L1 = ei_pload(&lb[7*PacketSize]);
T0 = cj.pmadd(A2, R0, T0);
T1 = cj.pmadd(A2, L0, T1);
if(MaxBlockRows==8)
{
R0 = ei_pload(&lb[8*PacketSize]);
L0 = ei_pload(&lb[9*PacketSize]);
}
T0 = cj.pmadd(A3, R1, T0);
T1 = cj.pmadd(A3, L1, T1);
if(MaxBlockRows==8)
{
R1 = ei_pload(&lb[10*PacketSize]);
L1 = ei_pload(&lb[11*PacketSize]);
T0 = cj.pmadd(A4, R0, T0);
T1 = cj.pmadd(A4, L0, T1);
R0 = ei_pload(&lb[12*PacketSize]);
L0 = ei_pload(&lb[13*PacketSize]);
T0 = cj.pmadd(A5, R1, T0);
T1 = cj.pmadd(A5, L1, T1);
R1 = ei_pload(&lb[14*PacketSize]);
L1 = ei_pload(&lb[15*PacketSize]);
T0 = cj.pmadd(A6, R0, T0);
T1 = cj.pmadd(A6, L0, T1);
T0 = cj.pmadd(A7, R1, T0);
T1 = cj.pmadd(A7, L1, T1);
}
lb += MaxBlockRows*2*PacketSize;
ei_pstore(&res[(j+l2j)*resStride+l2i+i], T0);
ei_pstore(&res[(j+l2j)*resStride+l2i+i+PacketSize], T1);
}
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
}
count += nr*PacketSize;
}
}
else
{
for(int k=0; k<actual_kc; k++)
{
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
if (nr==4)
{
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
}
count += nr*PacketSize;
}
}
}
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{
const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
if (hasAlpha)
{
for(int k=0; k<actual_kc; k++)
{
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
count += PacketSize;
}
}
else
{
for(int k=0; k<actual_kc; k++)
{
ei_pstore(&blockB[count], ei_pset1(b0[k]));
count += PacketSize;
}
}
}
}
delete[] block;
#endif
}
};
#endif // EIGEN_EXTERN_INSTANTIATIONS
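
The heart of ei_gebp_kernel above is its register blocking: for each mr x nr tile of the result (mr = 2*PacketSize rows, nr = 2 or 4 columns) the eight accumulators C0..C7 live in SIMD registers for the whole k loop and the result is written back only once. The following scalar analogue of that tile uses a hypothetical micro_kernel name and leaves out the packet expansion, the alpha pre-scaling of blockB and the 4-way k unrolling of the real code:

// scalar analogue of the GEBP register tile (assumed packed layouts:
// blA advances by mr per k, blB by nr per k)
template <int mr, int nr>
static void micro_kernel(double* C, int ldc,
                         const double* blA, const double* blB, int kc)
{
  double acc[mr][nr] = {};                 // the register tile (C0..C7 in the SSE code)
  for (int k = 0; k < kc; ++k)
    for (int j = 0; j < nr; ++j)
      for (int i = 0; i < mr; ++i)
        acc[i][j] += blA[k * mr + i] * blB[k * nr + j];  // one pmadd slot each
  for (int j = 0; j < nr; ++j)             // single read-modify-write of the result tile
    for (int i = 0; i < mr; ++i)
      C[j * ldc + i] += acc[i][j];
}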


@@ -0,0 +1,184 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
template<typename Scalar, int mr>
struct ei_symm_pack_lhs
{
void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2)
{
int count = 0;
const int peeled_mc = (actual_mc/mr)*mr;
if (lhsRowMajor)
{
for(int i=0; i<peeled_mc; i+=mr)
for(int k=0; k<actual_kc; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
for(int i=peeled_mc; i<actual_mc; i++)
{
const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
for(int k=0; k<actual_kc; k++)
blockA[count++] = llhs[k];
}
}
else
{
for(int i=0; i<peeled_mc; i+=mr)
{
for(int k=0; k<i; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
// symmetric copy
int h = 0;
for(int k=i; k<i+mr; k++)
{
// transposed copy
for(int w=0; w<h; w++)
blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
for(int w=h; w<mr; w++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
++h;
}
// transposed copy
for(int k=i+mr; k<actual_kc; k++)
for(int w=0; w<mr; w++)
blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
}
// do the same with mr==1
for(int i=peeled_mc; i<actual_mc; i++)
{
for(int k=0; k<=i; k++)
blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
// transposed copy
for(int k=i+1; k<actual_kc; k++)
blockA[count++] = lhs[(k2+k) + (i2+i)*lhsStride];
}
}
}
};
/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
* the general matrix matrix product.
*/
template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix(
int size,
const Scalar* lhs, int lhsStride,
const Scalar* rhs, int rhsStride, bool rhsRowMajor, int cols,
Scalar* res, int resStride,
Scalar alpha)
{
typedef typename ei_packet_traits<Scalar>::type Packet;
ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
if (ConjugateRhs)
alpha = ei_conj(alpha);
bool hasAlpha = alpha != Scalar(1);
typedef typename ei_packet_traits<Scalar>::type PacketType;
const bool lhsRowMajor = (StorageOrder==RowMajor);
enum {
PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
HalfRegisterCount = 4,
#else
HalfRegisterCount = 8,
#endif
// register block size along the N direction
nr = HalfRegisterCount/2,
// register block size along the M direction
mr = 2 * PacketSize,
// max cache block size along the K direction
Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width,
// max cache block size along the M direction
Max_mc = 2*Max_kc
};
int kc = std::min<int>(Max_kc,size); // cache block size along the K direction
int mc = std::min<int>(Max_mc,size); // cache block size along the M direction
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize);
// number of columns which can be processed in packets of nr columns
int packet_cols = (cols/nr)*nr;
for(int k2=0; k2<size; k2+=kc)
{
const int actual_kc = std::min(k2+kc,size)-k2;
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols);
// the selected lhs panel has to be split into three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
// 2 - the diagonal block => special packed copy
// 3 - the panel below the diagonal block => generic packed copy
for(int i2=0; i2<k2; i2+=mc)
{
const int actual_mc = std::min(i2+mc,k2)-i2;
// transposed packed copy
ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, !lhsRowMajor, actual_kc, actual_mc, k2, i2);
ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
}
// the block diagonal
{
const int actual_mc = std::min(k2+kc,size)-k2;
// symmetric packed copy
ei_symm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, k2);
ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
}
for(int i2=k2+kc; i2<size; i2+=mc)
{
const int actual_mc = std::min(i2+mc,size)-i2;
ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2);
ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
}
}
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
}
#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
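
For reference, what the new ei_product_selfadjoint_matrix computes in the real, column-major, LowerTriangular case exercised by the test below is simply C += alpha * A * B with A read only from its stored lower triangle; entries above the diagonal come from the mirrored ones, which is why the lhs packing is split into the three regions described above. A hypothetical scalar reference (useful only as a sanity check, not part of the commit):

// plain scalar reference, assuming real scalars, column-major storage,
// and the lower triangle of A as the stored part
static void selfadjoint_product_ref(int size, const double* A, int lda,
                                    const double* B, int ldb, int cols,
                                    double* C, int ldc, double alpha)
{
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < size; ++i) {
      double acc = 0;
      for (int k = 0; k < size; ++k) {
        // A(i,k): read the stored lower triangle, mirror it above the diagonal
        const double a = (k <= i) ? A[k * lda + i]
                                  : A[i * lda + k];
        acc += a * B[j * ldb + k];
      }
      C[j * ldc + i] += alpha * acc;
    }
}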


@@ -46,9 +46,9 @@ template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
Scalar s1 = ei_random<Scalar>(),
s2 = ei_random<Scalar>(),
s3 = ei_random<Scalar>();
m1 = m1.adjoint()*m1;
// lower
m2.setZero();
m2.template triangularView<LowerTriangular>() = m1;
@@ -68,7 +68,7 @@ template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
m2 = m1.template triangularView<LowerTriangular>();
m2.template selfadjointView<LowerTriangular>().rank2update(v1,v2);
VERIFY_IS_APPROX(m2, (m1 + v1 * v2.adjoint()+ v2 * v1.adjoint()).template triangularView<LowerTriangular>().toDense());
m2 = m1.template triangularView<UpperTriangular>();
m2.template selfadjointView<UpperTriangular>().rank2update(-v1,s2*v2,s3);
VERIFY_IS_APPROX(m2, (m1 + (-s2*s3) * (v1 * v2.adjoint()+ v2 * v1.adjoint())).template triangularView<UpperTriangular>().toDense());
@@ -99,4 +99,24 @@ void test_product_selfadjoint()
CALL_SUBTEST( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(17,17)) );
CALL_SUBTEST( product_selfadjoint(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(19, 19)) );
}
for(int i = 0; i < g_repeat ; i++)
{
int size = ei_random<int>(10,1024);
int cols = ei_random<int>(10,320);
MatrixXf A = MatrixXf::Random(size,size);
MatrixXf B = MatrixXf::Random(size,cols);
MatrixXf C = MatrixXf::Random(size,cols);
MatrixXf R = MatrixXf::Random(size,cols);
A = (A+A.transpose()).eval();
R = C + (A * B).eval();
A.corner(TopRight,size-1,size-1).triangularView<UpperTriangular>().setZero();
ei_product_selfadjoint_matrix<float,ColMajor,LowerTriangular,false,false>
(size, A.data(), A.stride(), B.data(), B.stride(), false, B.cols(), C.data(), C.stride(), 1);
// std::cerr << A << "\n\n" << C << "\n\n" << R << "\n\n";
VERIFY_IS_APPROX(C,R);
}
}