optimize linear vectorization in both Assign and Sum (optimal amortized performance)

2025-03-31 19:00:35 +08:00 · 2008-06-23 15:50:28 +00:00 · 2008-06-23 15:50:28 +00:00 · ac9aa47bbc
commit ac9aa47bbc
parent ea1990ef3d
2 changed files with 24 additions and 28 deletions
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@ -307,12 +307,17 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
    int index = 0;

    // do the vectorizable part of the assignment
-    for ( ; index<alignedSize ; index+=packetSize)
+    int row = 0;
+    int col = 0;
+    while (index<alignedSize)
    {
-      // FIXME the following is not really efficient
-      const int row = rowMajor ? index/innerSize : index%innerSize;
-      const int col = rowMajor ? index%innerSize : index/innerSize;
-      dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
+      int start = rowMajor ? col : row;
+      int end = std::min(innerSize, start + alignedSize-index);
+      for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
+        dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
+      index += (rowMajor ? col : row) - start;
+      row = rowMajor ? index/innerSize : index%innerSize;
+      col = rowMajor ? index%innerSize : index/innerSize;
    }

    // now we must do the rest without vectorization.
@ -380,7 +385,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
    const int innerSize = rowMajor ? dst.cols() : dst.rows();
    const int outerSize = rowMajor ? dst.rows() : dst.cols();
    const int alignedInnerSize = (innerSize/packetSize)*packetSize;
-    
+
    for(int i = 0; i < outerSize; i++)
    {
      // do the vectorizable part of the assignment
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@ -54,7 +54,7 @@ public:
    Unrolling = Cost <= UnrollingLimit
              ? CompleteUnrolling
              : NoUnrolling
-  };       
+  };
 };

 /***************************************************************************
@ -62,7 +62,7 @@ public:
 ***************************************************************************/

 /*** no vectorization ***/
- 
+
 template<typename Derived, int Start, int Length>
 struct ei_sum_novec_unroller
 {
@ -194,32 +194,23 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
    // do the vectorizable part of the sum
    if(size >= packetSize)
    {
-    asm("#begin");
-
      PacketScalar packet_res;
      packet_res = mat.template packet<Aligned>(0, 0);
-      int index;
-      if(Derived::IsVectorAtCompileTime)
+      int row = 0;
+      int col = 0;
+      int index = packetSize;
+      while (index<alignedSize)
      {
-        for(index = packetSize; index<alignedSize ; index+=packetSize)
-        {
-          const int row = Derived::RowsAtCompileTime==1 ? 0 : index;
-          const int col = Derived::RowsAtCompileTime==1 ? index : 0;
+        row = rowMajor ? index/innerSize : index%innerSize;
+        col = rowMajor ? index%innerSize : index/innerSize;
+        int start = rowMajor ? col : row;
+        int end = std::min(innerSize, start+alignedSize-index);
+        if (end<start) getchar();
+        for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
          packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
-        }
-      }
-      else
-      {
-        for(index = packetSize; index<alignedSize ; index+=packetSize)
-        {
-          // FIXME the following is not really efficient
-          const int row = rowMajor ? index/innerSize : index%innerSize;
-          const int col = rowMajor ? index%innerSize : index/innerSize;
-          packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
-        }
+        index += (rowMajor ? col : row) - start;
      }
      res = ei_predux(packet_res);
-    asm("#end");

      // now we must do the rest without vectorization.
      if(alignedSize == size) return res;