Rewrite the vectorized meta unroller of sum to reduce instruction dependency => significant speedup
This commit is contained in:
Gael Guennebaud 2009-01-17 09:48:58 +00:00
parent e556e647f4
commit 1eec38dc36

View File

@ -100,18 +100,13 @@ struct ei_sum_novec_unroller<Derived, Start, 1>
}; };
/*** vectorization ***/ /*** vectorization ***/
template<typename Derived, int Index, int Stop, template<typename Derived, int Start, int Length>
bool LastPacket = (Stop-Index == ei_packet_traits<typename Derived::Scalar>::size)>
struct ei_sum_vec_unroller struct ei_sum_vec_unroller
{ {
enum { enum {
row = int(Derived::Flags)&RowMajorBit PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
? Index / int(Derived::ColsAtCompileTime) HalfLength = Length/2
: Index % Derived::RowsAtCompileTime,
col = int(Derived::Flags)&RowMajorBit
? Index % int(Derived::ColsAtCompileTime)
: Index / Derived::RowsAtCompileTime
}; };
typedef typename Derived::Scalar Scalar; typedef typename Derived::Scalar Scalar;
@ -120,22 +115,22 @@ struct ei_sum_vec_unroller
inline static PacketScalar run(const Derived &mat) inline static PacketScalar run(const Derived &mat)
{ {
return ei_padd( return ei_padd(
mat.template packet<Aligned>(row, col), ei_sum_vec_unroller<Derived, Start, HalfLength>::run(mat),
ei_sum_vec_unroller<Derived, Index+ei_packet_traits<typename Derived::Scalar>::size, Stop>::run(mat) ei_sum_vec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat) );
);
} }
}; };
template<typename Derived, int Index, int Stop> template<typename Derived, int Start>
struct ei_sum_vec_unroller<Derived, Index, Stop, true> struct ei_sum_vec_unroller<Derived, Start, 1>
{ {
enum { enum {
index = Start * ei_packet_traits<typename Derived::Scalar>::size,
row = int(Derived::Flags)&RowMajorBit row = int(Derived::Flags)&RowMajorBit
? Index / int(Derived::ColsAtCompileTime) ? index / int(Derived::ColsAtCompileTime)
: Index % Derived::RowsAtCompileTime, : index % Derived::RowsAtCompileTime,
col = int(Derived::Flags)&RowMajorBit col = int(Derived::Flags)&RowMajorBit
? Index % int(Derived::ColsAtCompileTime) ? index % int(Derived::ColsAtCompileTime)
: Index / Derived::RowsAtCompileTime, : index / Derived::RowsAtCompileTime,
alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
}; };
@ -238,7 +233,7 @@ struct ei_sum_impl<Derived, LinearVectorization, CompleteUnrolling>
}; };
static Scalar run(const Derived& mat) static Scalar run(const Derived& mat)
{ {
Scalar res = ei_predux(ei_sum_vec_unroller<Derived, 0, VectorizationSize>::run(mat)); Scalar res = ei_predux(ei_sum_vec_unroller<Derived, 0, Size / PacketSize>::run(mat));
if (VectorizationSize != Size) if (VectorizationSize != Size)
res += ei_sum_novec_unroller<Derived, VectorizationSize, Size-VectorizationSize>::run(mat); res += ei_sum_novec_unroller<Derived, VectorizationSize, Size-VectorizationSize>::run(mat);
return res; return res;