diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index 06b3f5876..669649de8 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -460,7 +460,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( _EIGEN_ACCUMULATE_PACKETS(,u,u,); break; default: - for (int j = peeledSize; j +EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( + const Scalar* lhs, int lhsStride, + const Scalar* rhs, int rhsSize, + ResType& res) +{ + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + + #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) {\ + Packet b = ei_pload(&rhs[j]); \ + ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload##A0 (&lhs[j+iN0]))); \ + ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_pload##A13(&lhs[j+iN1]))); \ + ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload##A2 (&lhs[j+iN2]))); \ + ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_pload##A13(&lhs[j+iN3]))); } + + asm("#begin matrix_vector_product"); + typedef typename ei_packet_traits::type Packet; + const int PacketSize = sizeof(Packet)/sizeof(Scalar); + + enum { AllAligned, EvenAligned, FirstAligned, NoneAligned }; + const int rowsAtOnce = 4; + const int peels = 2; + const int PacketAlignedMask = PacketSize-1; + const int PeelAlignedMask = PacketSize*peels-1; + const bool Vectorized = sizeof(Packet) != sizeof(Scalar); + const int size = rhsSize; + + // How many coeffs of the result do we have to skip to be aligned. + // Here we assume data are at least aligned on the base scalar type that is mandatory anyway. + const int alignedStart = Vectorized + ? std::min( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size) + : 0; + const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask); + const int peeledSize = peels>1 ? 
alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0; + + const int alignmentStep = lhsStride % PacketSize; + int alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==2 ? EvenAligned + : FirstAligned; + + // find how many rows do we have to skip to be aligned with rhs (if possible) + int skipRows=0; + for (; skipRowsalignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (int j = alignedStart; jalignedStart) + { + // process aligned rhs coeffs + if (iN0 % PacketSize==0) + for (int j = alignedStart;j class ei_matrix_storage */ template struct ei_product_mode { - enum{ value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal) - ? DiagonalProduct - : (Rhs::Flags & Lhs::Flags & SparseBit) - ? SparseProduct - : Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - || ((Rhs::Flags&RowMajorBit) && Lhs::IsVectorAtCompileTime)) - && ( Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - || ((!(Lhs::Flags&RowMajorBit)) && Rhs::IsVectorAtCompileTime)) - ? CacheFriendlyProduct : NormalProduct }; + enum{ + /* The last two clauses send matrix*vector / vector*matrix products to NormalProduct when the matrix operand has the listed storage order but no DirectAccessBit. Each flag test is written !(Flags & Bit): unary '!' binds tighter than binary '&', so !Flags & Bit would evaluate to 0 for any non-zero Flags and the clauses would never exclude anything. */ + value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal) + ? DiagonalProduct + : (Rhs::Flags & Lhs::Flags & SparseBit) + ? SparseProduct + : Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + && ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + || Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ) + && (!(Rhs::IsVectorAtCompileTime && (Lhs::Flags&RowMajorBit) && (!(Lhs::Flags&DirectAccessBit)))) + && (!(Lhs::IsVectorAtCompileTime && (!(Rhs::Flags&RowMajorBit)) && (!(Rhs::Flags&DirectAccessBit)))) + ? 
CacheFriendlyProduct + : NormalProduct }; }; /** \class Product @@ -210,11 +213,9 @@ template class Product */ inline bool _useCacheFriendlyProduct() const { - return m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && (rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - || ((_RhsNested::Flags&RowMajorBit) && _LhsNested::IsVectorAtCompileTime)) - && (cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - || ((!(_LhsNested::Flags&RowMajorBit)) && _RhsNested::IsVectorAtCompileTime)); + return m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + && ( rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + || cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD); } inline int rows() const { return m_lhs.rows(); } @@ -365,7 +366,6 @@ struct ei_product_coeff_impl } }; -// NOTE the following specializations are because taking .col(0) on a vector is a bit slower template struct ei_product_coeff_vectorized_dyn_selector { @@ -378,6 +378,7 @@ struct ei_product_coeff_vectorized_dyn_selector } }; +// NOTE the 2 following specializations are because taking .col(0) on a vector is a bit slower template struct ei_product_coeff_vectorized_dyn_selector { @@ -483,6 +484,10 @@ template static void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res); +template +static void ei_cache_friendly_product_rowmajor_times_vector( + const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res); + template::RowsAtCompileTime, int LhsOrder = int(ei_traits::LhsFlags)&RowMajorBit ? 
RowMajor : ColMajor, @@ -538,7 +543,7 @@ struct ei_cache_friendly_product_selector >(_res, res.size()); + res = Map >(_res, res.size()); } }; @@ -574,17 +579,82 @@ struct ei_cache_friendly_product_selector >(_res, res.size()) = res; + Map >(_res, res.size()) = res; } ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), product.lhs().transpose(), _res); if (!EvalToRes) - res = Map >(_res, res.size()); + res = Map >(_res, res.size()); } }; +// optimized rowmajor - vector product +template +struct ei_cache_friendly_product_selector +{ + typedef typename ProductType::Scalar Scalar; + typedef typename ei_traits::_RhsNested Rhs; + enum { + UseRhsDirectly = ((ei_packet_traits::size==1) || (Rhs::Flags&ActualPacketAccessBit)) + && (!(Rhs::Flags & RowMajorBit)) }; + + template + inline static void run(DestDerived& res, const ProductType& product) + { + Scalar* __restrict__ _rhs; + if (UseRhsDirectly) + _rhs = &product.rhs().const_cast_derived().coeffRef(0); + else + { + _rhs = (Scalar*)alloca(sizeof(Scalar)*product.rhs().size()); + Map >(_rhs, product.rhs().size()) = product.rhs(); + } + ei_cache_friendly_product_rowmajor_times_vector(&product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(), + _rhs, product.rhs().size(), res); + } +}; + +// optimized vector - colmajor product +template +struct ei_cache_friendly_product_selector +{ + typedef typename ProductType::Scalar Scalar; + typedef typename ei_traits::_LhsNested Lhs; + enum { + UseLhsDirectly = ((ei_packet_traits::size==1) || (Lhs::Flags&ActualPacketAccessBit)) + && (!(Lhs::Flags & RowMajorBit)) }; + + template + inline static void run(DestDerived& res, const ProductType& product) + { + Scalar* __restrict__ _lhs; + if (UseLhsDirectly) + _lhs = &product.lhs().const_cast_derived().coeffRef(0); + else + { + _lhs = (Scalar*)alloca(sizeof(Scalar)*product.lhs().size()); + Map >(_lhs, product.lhs().size()) = product.lhs(); + } + 
ei_cache_friendly_product_rowmajor_times_vector(&product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), + _lhs, product.lhs().size(), res); + } +}; + +// discard this case which has to be handled by the default path +// (we keep it to be sure to hit a compilation error if this is not the case) +template +struct ei_cache_friendly_product_selector +{}; + +// discard this case which has to be handled by the default path +// (we keep it to be sure to hit a compilation error if this is not the case) +template +struct ei_cache_friendly_product_selector +{}; + + /** \internal */ template template diff --git a/bench/btl/data/perlib_plot_settings.txt b/bench/btl/data/perlib_plot_settings.txt index 8e5784497..3e23fee53 100644 --- a/bench/btl/data/perlib_plot_settings.txt +++ b/bench/btl/data/perlib_plot_settings.txt @@ -5,7 +5,6 @@ mtl4 ; with lines lc rgbcolor "#74B973" lt 1 blitz ; with lines lc rgbcolor "#38F5F5" lt 1 ATLAS ; with lines lc rgbcolor "green" lt 1 INTEL_MKL ; with lines lc rgbcolor "yellow" lt 2 -MKL_INTEL ; with lines lc rgbcolor "yellow" lt 2 ublas ; with lines lc rgbcolor "red" lt 1 F77 ; with lines lc rgbcolor "#9A6B36" lt 1 C ; with lines lc rgbcolor "#7DF4FF" lt 1