From ed134a0ce5670984b67c1797449b8d5b7a9a55f0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 2 Mar 2009 13:15:15 +0000 Subject: [PATCH] performance improvement: rewrite of the matrix-matrix product following Goto's paper => x1.4 speedup with more consistent perf results --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 576 ++++++++++-------- 1 file changed, 307 insertions(+), 269 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3bf4a2e162..7ec33d7f48 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -69,6 +69,286 @@ static void ei_cache_friendly_product( typedef typename ei_packet_traits::type PacketType; + + +#ifndef EIGEN_USE_ALT_PRODUCT + + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + HalfRegisterCount = 4, + #else + HalfRegisterCount = 8, + #endif + + // register block size along the N direction + nr = HalfRegisterCount/2, + + // register block size along the M direction + mr = 2 * PacketSize, + + // max cache block size along the K direction + Max_kc = ei_L2_block_traits::width, + + // max cache block size along the M direction + Max_mc = 2*Max_kc + }; + + int kc = std::min(Max_kc,depth); // cache block size along the K direction + int mc = std::min(Max_mc,rows); // cache block size along the M direction + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + + // number of columns which can be processed by packet of nr columns + int packet_cols = (cols/nr)*nr; + + // GEMM_VAR1 + for(int k2=0; k2 GEPP_VAR1 + for(int i2=0; i2::width }; - + const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); const int remainingSize = depth % PacketSize; const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; - const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; - const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize; + + const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512; + const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128; + const int l2BlockSize = MaxL2BlockSize > size ? size : 256; const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); - Scalar* EIGEN_RESTRICT block = 0; - const int allocBlockSize = l2BlockRows*size; - block = ei_aligned_stack_new(Scalar, allocBlockSize); - Scalar* EIGEN_RESTRICT rhsCopy - = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned); - -#ifndef EIGEN_USE_NEW_PRODUCT - // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i0) - { - for (int k=l2k; k1 && resIsAligned) - { - // the result is aligned: let's do packet reduction - ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0]))); - if (PacketSize==2) - ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); - if (MaxBlockRows==8) - { - ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); - if (PacketSize==2) - ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); - } - } - else - { - // not aligned => per coeff packet reduction - localRes[0] += ei_predux(dst[0]); - localRes[1] += ei_predux(dst[1]); - localRes[2] += ei_predux(dst[2]); - localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - localRes[4] += ei_predux(dst[4]); - localRes[5] += ei_predux(dst[5]); - localRes[6] += ei_predux(dst[6]); - localRes[7] += ei_predux(dst[7]); - } - } - } - } - if (l2blockRemainingRows>0) - { - int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows; - const Scalar* localB = &block[offsetblock]; - - for(int l1j=l2j; l1j=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); - if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); - if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); - if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); - if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); - if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); - } - } - - Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]); - - // process the remaining rows once at a time - localRes[0] += ei_predux(dst[0]); - if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); - if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); - if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); - if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); - if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); - if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); - } - - } - } - } - } - } - if (PacketSize>1 && remainingSize) - { - if (lhsRowMajor) - { - for (int j=0; j