diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 5b15b4ee9..b7cc7c0e9 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -88,10 +88,11 @@ private: /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) - && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize))) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block - in a fixed-size matrix */ + in a fixed-size matrix + However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */ }; public: @@ -136,6 +137,11 @@ public: : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) +#if EIGEN_UNALIGNED_VECTORIZE + : int(Traversal) == int(SliceVectorizedTraversal) + ? ( bool(MayUnrollInner) ? int(InnerUnrolling) + : int(NoUnrolling) ) +#endif : int(NoUnrolling) }; @@ -277,24 +283,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } }; -template +template struct copy_using_evaluator_innervec_InnerUnrolling { typedef typename Kernel::PacketType PacketType; - enum { - SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, - DstAlignment = Kernel::AssignmentTraits::DstAlignment - }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) { kernel.template assignPacketByOuterInner(outer, Index_); enum { NextIndex = Index_ + unpacket_traits::size }; - copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); } }; -template -struct copy_using_evaluator_innervec_InnerUnrolling +template +struct copy_using_evaluator_innervec_InnerUnrolling { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { } }; @@ -423,9 +425,10 @@ struct dense_assignment_loop::size, + packetSize =unpacket_traits::size, alignedSize = (size/packetSize)*packetSize }; copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); @@ -472,9 +475,11 @@ struct dense_assignment_loop EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::AssignmentTraits Traits; const Index outerSize = kernel.outerSize(); for(Index outer = 0; outer < outerSize; ++outer) - copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); } }; @@ -554,6 +559,29 @@ struct dense_assignment_loop } }; +#if EIGEN_UNALIGNED_VECTORIZE +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + + enum { size = DstXprType::InnerSizeAtCompileTime, + packetSize =unpacket_traits::size, + vectorizableSize = (size/packetSize)*packetSize }; + + for(Index outer = 0; outer < kernel.outerSize(); ++outer) + { + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + } + } +}; +#endif + + /*************************************************************************** * Part 4 : Generic dense assignment kernel ***************************************************************************/ @@ -681,7 +709,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX typedef generic_dense_assignment_kernel Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - + dense_assignment_loop::run(kernel); }