Optimize linear vectorization in both Assign and Sum (optimal amortized performance)

This commit is contained in:
Gael Guennebaud 2008-06-23 15:50:28 +00:00
parent ea1990ef3d
commit ac9aa47bbc
2 changed files with 24 additions and 28 deletions

View File

@ -307,12 +307,17 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
int index = 0; int index = 0;
// do the vectorizable part of the assignment // do the vectorizable part of the assignment
for ( ; index<alignedSize ; index+=packetSize) int row = 0;
int col = 0;
while (index<alignedSize)
{ {
// FIXME the following is not really efficient int start = rowMajor ? col : row;
const int row = rowMajor ? index/innerSize : index%innerSize; int end = std::min(innerSize, start + alignedSize-index);
const int col = rowMajor ? index%innerSize : index/innerSize; for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col)); dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
} }
// now we must do the rest without vectorization. // now we must do the rest without vectorization.
@ -380,7 +385,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
const int innerSize = rowMajor ? dst.cols() : dst.rows(); const int innerSize = rowMajor ? dst.cols() : dst.rows();
const int outerSize = rowMajor ? dst.rows() : dst.cols(); const int outerSize = rowMajor ? dst.rows() : dst.cols();
const int alignedInnerSize = (innerSize/packetSize)*packetSize; const int alignedInnerSize = (innerSize/packetSize)*packetSize;
for(int i = 0; i < outerSize; i++) for(int i = 0; i < outerSize; i++)
{ {
// do the vectorizable part of the assignment // do the vectorizable part of the assignment

View File

@ -54,7 +54,7 @@ public:
Unrolling = Cost <= UnrollingLimit Unrolling = Cost <= UnrollingLimit
? CompleteUnrolling ? CompleteUnrolling
: NoUnrolling : NoUnrolling
}; };
}; };
/*************************************************************************** /***************************************************************************
@ -62,7 +62,7 @@ public:
***************************************************************************/ ***************************************************************************/
/*** no vectorization ***/ /*** no vectorization ***/
template<typename Derived, int Start, int Length> template<typename Derived, int Start, int Length>
struct ei_sum_novec_unroller struct ei_sum_novec_unroller
{ {
@ -194,32 +194,23 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
// do the vectorizable part of the sum // do the vectorizable part of the sum
if(size >= packetSize) if(size >= packetSize)
{ {
asm("#begin");
PacketScalar packet_res; PacketScalar packet_res;
packet_res = mat.template packet<Aligned>(0, 0); packet_res = mat.template packet<Aligned>(0, 0);
int index; int row = 0;
if(Derived::IsVectorAtCompileTime) int col = 0;
int index = packetSize;
while (index<alignedSize)
{ {
for(index = packetSize; index<alignedSize ; index+=packetSize) row = rowMajor ? index/innerSize : index%innerSize;
{ col = rowMajor ? index%innerSize : index/innerSize;
const int row = Derived::RowsAtCompileTime==1 ? 0 : index; int start = rowMajor ? col : row;
const int col = Derived::RowsAtCompileTime==1 ? index : 0; int end = std::min(innerSize, start+alignedSize-index);
if (end<start) getchar();
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col)); packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
} index += (rowMajor ? col : row) - start;
}
else
{
for(index = packetSize; index<alignedSize ; index+=packetSize)
{
// FIXME the following is not really efficient
const int row = rowMajor ? index/innerSize : index%innerSize;
const int col = rowMajor ? index%innerSize : index/innerSize;
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
}
} }
res = ei_predux(packet_res); res = ei_predux(packet_res);
asm("#end");
// now we must do the rest without vectorization. // now we must do the rest without vectorization.
if(alignedSize == size) return res; if(alignedSize == size) return res;