mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-01-24 14:45:14 +08:00
Optimize linear vectorization in both Assign and Sum (optimal amortized performance)
This commit is contained in:
parent
ea1990ef3d
commit
ac9aa47bbc
@ -307,12 +307,17 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
|
|||||||
int index = 0;
|
int index = 0;
|
||||||
|
|
||||||
// do the vectorizable part of the assignment
|
// do the vectorizable part of the assignment
|
||||||
for ( ; index<alignedSize ; index+=packetSize)
|
int row = 0;
|
||||||
|
int col = 0;
|
||||||
|
while (index<alignedSize)
|
||||||
{
|
{
|
||||||
// FIXME the following is not really efficient
|
int start = rowMajor ? col : row;
|
||||||
const int row = rowMajor ? index/innerSize : index%innerSize;
|
int end = std::min(innerSize, start + alignedSize-index);
|
||||||
const int col = rowMajor ? index%innerSize : index/innerSize;
|
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
|
||||||
dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
|
dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
|
||||||
|
index += (rowMajor ? col : row) - start;
|
||||||
|
row = rowMajor ? index/innerSize : index%innerSize;
|
||||||
|
col = rowMajor ? index%innerSize : index/innerSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
// now we must do the rest without vectorization.
|
// now we must do the rest without vectorization.
|
||||||
@ -380,7 +385,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
|
|||||||
const int innerSize = rowMajor ? dst.cols() : dst.rows();
|
const int innerSize = rowMajor ? dst.cols() : dst.rows();
|
||||||
const int outerSize = rowMajor ? dst.rows() : dst.cols();
|
const int outerSize = rowMajor ? dst.rows() : dst.cols();
|
||||||
const int alignedInnerSize = (innerSize/packetSize)*packetSize;
|
const int alignedInnerSize = (innerSize/packetSize)*packetSize;
|
||||||
|
|
||||||
for(int i = 0; i < outerSize; i++)
|
for(int i = 0; i < outerSize; i++)
|
||||||
{
|
{
|
||||||
// do the vectorizable part of the assignment
|
// do the vectorizable part of the assignment
|
||||||
|
@ -54,7 +54,7 @@ public:
|
|||||||
Unrolling = Cost <= UnrollingLimit
|
Unrolling = Cost <= UnrollingLimit
|
||||||
? CompleteUnrolling
|
? CompleteUnrolling
|
||||||
: NoUnrolling
|
: NoUnrolling
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
@ -62,7 +62,7 @@ public:
|
|||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
/*** no vectorization ***/
|
/*** no vectorization ***/
|
||||||
|
|
||||||
template<typename Derived, int Start, int Length>
|
template<typename Derived, int Start, int Length>
|
||||||
struct ei_sum_novec_unroller
|
struct ei_sum_novec_unroller
|
||||||
{
|
{
|
||||||
@ -194,32 +194,23 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
|
|||||||
// do the vectorizable part of the sum
|
// do the vectorizable part of the sum
|
||||||
if(size >= packetSize)
|
if(size >= packetSize)
|
||||||
{
|
{
|
||||||
asm("#begin");
|
|
||||||
|
|
||||||
PacketScalar packet_res;
|
PacketScalar packet_res;
|
||||||
packet_res = mat.template packet<Aligned>(0, 0);
|
packet_res = mat.template packet<Aligned>(0, 0);
|
||||||
int index;
|
int row = 0;
|
||||||
if(Derived::IsVectorAtCompileTime)
|
int col = 0;
|
||||||
|
int index = packetSize;
|
||||||
|
while (index<alignedSize)
|
||||||
{
|
{
|
||||||
for(index = packetSize; index<alignedSize ; index+=packetSize)
|
row = rowMajor ? index/innerSize : index%innerSize;
|
||||||
{
|
col = rowMajor ? index%innerSize : index/innerSize;
|
||||||
const int row = Derived::RowsAtCompileTime==1 ? 0 : index;
|
int start = rowMajor ? col : row;
|
||||||
const int col = Derived::RowsAtCompileTime==1 ? index : 0;
|
int end = std::min(innerSize, start+alignedSize-index);
|
||||||
|
if (end<start) getchar();
|
||||||
|
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
|
||||||
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
|
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
|
||||||
}
|
index += (rowMajor ? col : row) - start;
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for(index = packetSize; index<alignedSize ; index+=packetSize)
|
|
||||||
{
|
|
||||||
// FIXME the following is not really efficient
|
|
||||||
const int row = rowMajor ? index/innerSize : index%innerSize;
|
|
||||||
const int col = rowMajor ? index%innerSize : index/innerSize;
|
|
||||||
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
res = ei_predux(packet_res);
|
res = ei_predux(packet_res);
|
||||||
asm("#end");
|
|
||||||
|
|
||||||
// now we must do the rest without vectorization.
|
// now we must do the rest without vectorization.
|
||||||
if(alignedSize == size) return res;
|
if(alignedSize == size) return res;
|
||||||
|
Loading…
Reference in New Issue
Block a user