From 35927e78c21b11113889f93325b980fc13349af9 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 24 Jul 2009 16:21:52 +0200
Subject: [PATCH] add WIP trsm

---
 Eigen/src/Core/products/GeneralMatrixMatrix.h |  20 +-
 .../Core/products/TriangularSolverMatrix.h    | 203 ++++++++++++++++++
 2 files changed, 214 insertions(+), 9 deletions(-)
 create mode 100644 Eigen/src/Core/products/TriangularSolverMatrix.h
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 776b7c5d6..a1be8ea50 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -110,10 +110,12 @@ static void run(int rows, int cols, int depth,
 template<typename Scalar, int mr, int nr, typename Conj>
 struct ei_gebp_kernel
 {
-  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols)
+  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
   {
     typedef typename ei_packet_traits<Scalar>::type PacketType;
     enum { PacketSize = ei_packet_traits<Scalar>::size };
+    if(strideA==-1) strideA = depth;
+    if(strideB==-1) strideB = depth;
     Conj cj;
     int packet_cols = (cols/nr) * nr;
     const int peeled_mc = (rows/mr)*mr;
@@ -123,7 +125,7 @@ struct ei_gebp_kernel
       // loops on each register blocking of lhs/res
       for(int i=0; i<peeled_mc; i+=mr)
       {
-        const Scalar* blA = &blockA[i*depth];
+        const Scalar* blA = &blockA[i*strideA];
         #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
         #endif
@@ -144,7 +146,7 @@ struct ei_gebp_kernel
         // performs "inner" product
         // TODO let's check wether the flowing peeled loop could not be
         //      optimized via optimal prefetching from one loop to the other
-        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
         const int peeled_kc = (depth/4)*4;
         for(int k=0; k<peeled_kc; k+=4)
         {
@@ -246,14 +248,14 @@ struct ei_gebp_kernel
       }
       for(int i=peeled_mc; i<rows; i++)
       {
-        const Scalar* blA = &blockA[i*depth];
+        const Scalar* blA = &blockA[i*strideA];
         #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
         #endif
 
         // gets a 1 x nr res block as registers
         Scalar C0(0), C1(0), C2(0), C3(0);
-        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
         for(int k=0; k<depth; k++)
         {
           Scalar B0, B1, B2, B3, A0;
@@ -283,7 +285,7 @@ struct ei_gebp_kernel
     {
       for(int i=0; i<peeled_mc; i+=mr)
       {
-        const Scalar* blA = &blockA[i*depth];
+        const Scalar* blA = &blockA[i*strideA];
         #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
         #endif
@@ -295,7 +297,7 @@ struct ei_gebp_kernel
         C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
         C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
 
-        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
         for(int k=0; k<depth; k++)
         {
           PacketType B0, A0, A1;
@@ -315,14 +317,14 @@ struct ei_gebp_kernel
       }
       for(int i=peeled_mc; i<rows; i++)
       {
-        const Scalar* blA = &blockA[i*depth];
+        const Scalar* blA = &blockA[i*strideA];
         #ifdef EIGEN_VECTORIZE_SSE
         _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
         #endif
 
         // gets a 1 x 1 res block as registers
         Scalar C0(0);
-        const Scalar* blB = &blockB[j2*depth*PacketSize];
+        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
         for(int k=0; k<depth; k++)
           C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
         res[(j2+0)*resStride + i] += C0;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 000000000..0a5b4749f
--- /dev/null
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+/* Optimized triangular solver with multiple right hand side (_TRSM)
+ */
+template <typename Scalar,
+          int LhsStorageOrder,
+          int RhsStorageOrder,
+          int Mode>
+struct ei_triangular_solve_matrix//<Scalar,LhsStorageOrder,RhsStorageOrder>
+{
+
+  static EIGEN_DONT_INLINE void run(
+    int size, int cols,
+    const Scalar* _lhs, int lhsStride,
+          Scalar* _rhs, int rhsStride)
+  {
+    Map<Matrix<Scalar,Dynamic,Dynamic,LhsStorageOrder> > lhs(_lhs, size, size);
+    Map<Matrix<Scalar,Dynamic,Dynamic,RhsStorageOrder> > rhs(_rhs, size, cols);
+    //ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
+    //ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
+
+    typedef ei_product_blocking_traits<Scalar> Blocking;
+    enum {
+      SmallPanelWidth   = EIGEN_ENUM_MAX(Blocking::mr,Blocking::nr),
+      IsLowerTriangular = (Mode&LowerTriangular) == LowerTriangular
+    };
+
+    int kc = 8;//std::min<int>(Blocking::Max_kc,size);  // cache block size along the K direction
+    int mc = 8;//std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction
+
+    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+    Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+
+    ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,false> > gebp_kernel;
+
+    for(int k2=0; k2<size; k2+=kc)
+    {
+      const int actual_kc = std::min(k2+kc,size)-k2;
+
+      // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+      // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+      // A11 (the triangular part) and A21 the remaining rectangular part.
+      // Then the high level algorithm is:
+      //  - B = R1                    => general block copy
+      //  - R1 = L1^-1 B              => tricky part
+      //  - update B from the new R1  => actually this has to performed continuously during the above step
+      //  - R2 = L2 * B               => GEPP
+
+      // B = R1
+      ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
+        (blockB, &rhs(k2,0), rhsStride, -1, actual_kc, cols);
+
+        Map<MatrixXf>(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)).setZero();
+
+      // The tricky part: R1 = L1^-1 B while updating B from R1
+      // The idea is to split L1 into multiple small vertical panels.
+      // Each panel can be split into a small triangular part A1 which is processed without optimization,
+      // and the remaining small part A2 which is processed using gebp with appropriate block strides
+      {
+        // pack L1
+//         ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
+//           (blockA, &lhs(k2, k2), lhsStride, actual_kc, actual_kc);
+
+        // for each small vertical panels of lhs
+        for (int k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+        {
+          int actualPanelWidth = std::min<int>(SmallPanelWidth,actual_kc-k1);
+          // tr solve
+          for (int k=0; k<actualPanelWidth; ++k)
+          {
+            int i = k2+k1+k;
+            if(!(Mode & UnitDiagBit))
+              rhs.row(i) /= lhs(i,i);
+
+            int rs = actualPanelWidth - k - 1; // remaining size
+            //std::cerr << i << " ; " << k << " " << rs << "\n";
+            if (rs>0)
+            {
+              rhs.block(i+1,0,rs,cols) -=
+                    lhs.col(i).segment(IsLowerTriangular ? i+1 : i-rs, rs) * rhs.row(i);
+            }
+          }
+          // update the respective row of B from rhs
+          {
+            const Scalar* lr = _rhs+k2+k1;
+            int packet_cols = (cols/Blocking::nr) * Blocking::nr;
+            int count = 0;
+            for(int j2=0; j2<packet_cols; j2+=Blocking::nr)
+            {
+              // skip what we have before
+              count += Blocking::PacketSize * Blocking::nr * (k1-k2);
+              const Scalar* b0 = &lr[(j2+0)*rhsStride];
+              const Scalar* b1 = &lr[(j2+1)*rhsStride];
+              const Scalar* b2 = &lr[(j2+2)*rhsStride];
+              const Scalar* b3 = &lr[(j2+3)*rhsStride];
+              for(int k=0; k<actualPanelWidth; k++)
+              {
+                ei_pstore(&blockB[count+0*Blocking::PacketSize], ei_pset1(-b0[k]));
+                ei_pstore(&blockB[count+1*Blocking::PacketSize], ei_pset1(-b1[k]));
+                if (Blocking::nr==4)
+                {
+                  ei_pstore(&blockB[count+2*Blocking::PacketSize], ei_pset1(-b2[k]));
+                  ei_pstore(&blockB[count+3*Blocking::PacketSize], ei_pset1(-b3[k]));
+                }
+                count += Blocking::nr*Blocking::PacketSize;
+              }
+              // skip what we have after
+              count += Blocking::PacketSize * Blocking::nr * (actual_kc-k1-actualPanelWidth);
+            }
+            // copy the remaining columns one at a time (nr==1)
+            for(int j2=packet_cols; j2<cols; ++j2)
+            {
+              count += Blocking::PacketSize * (k1-k2);
+              const Scalar* b0 = &lr[(j2+0)*rhsStride];
+              for(int k=0; k<actualPanelWidth; k++)
+              {
+                ei_pstore(&blockB[count], ei_pset1(-b0[k]));
+                count += Blocking::PacketSize;
+              }
+              count += Blocking::PacketSize * (actual_kc-k1-actualPanelWidth);
+            }
+          }
+
+//           std::cerr << Map<MatrixXf>(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)) << "\n\n";
+
+//           MatrixXf aux(Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr));
+//           aux.setZero();
+
+//           ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
+//                           (aux.data(), &rhs(k2,0), rhsStride, -1, actual_kc, cols);
+
+//           std::cerr << Map<MatrixXf>(blockB,Blocking::PacketSize*Blocking::nr*actual_kc, cols/Blocking::nr+(cols%Blocking::nr)) - aux << "\n\n";
+
+
+          // gebp
+          int i = k1+actualPanelWidth;
+          int rs = actual_kc-i;
+
+//           ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
+//                           (blockB, &rhs(k1,0), rhsStride, -1, actualPanelWidth, cols);
+
+          ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
+                          (blockA, &lhs(k2+i, k2+k1), lhsStride, actualPanelWidth, rs);
+
+          if (rs>0)
+            rhs.block(i,0,actual_kc-i,cols) -= lhs.block(i,k1,rs,actualPanelWidth) * rhs.block(k1,0,actualPanelWidth,cols);
+
+//             gebp_kernel(_rhs+i+k2, rhsStride,
+//                       blockA/*+actual_kc*i+k1*rs*/, blockB/*+k1*Blocking::PacketSize*Blocking::nr*/, rs, actualPanelWidth, cols, actualPanelWidth/*actual_kc*/, actual_kc, 0, k1*Blocking::PacketSize);
+
+//           gebp_kernel(_rhs+i, rhsStride,
+//                       blockA+actual_kc*i+k1*rs, blockB+k1*Blocking::PacketSize*Blocking::nr, rs, actualPanelWidth, cols, actual_kc, actual_kc);
+
+//           gebp_kernel(_rhs+k2+i, rhsStride,
+//                       blockA+actual_kc*i+k1, blockB+k1*Blocking::PacketSize, actual_kc-i, actualPanelWidth, cols, actual_kc, actual_kc);
+        }
+      }
+
+      //  - R2 = A2 * B => GEPP
+      for(int i2=k2+kc; i2<size; i2+=mc)
+      {
+        const int actual_mc = std::min(i2+mc,size)-i2;
+        ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
+          (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
+
+        std::cerr << i2 << " sur " << actual_mc << " -= " << i2 << "x" << k2 << "+" << actual_mc<<"," <<actual_kc << " * " << k2 << " sur " << actual_kc << "\n";
+        rhs.block(i2,0,actual_mc,cols) -= lhs.block(i2,k2,actual_mc,actual_kc) * rhs.block(k2,0,actual_kc,cols);
+
+//         gebp_kernel(_rhs+i2, rhsStride, blockA, blockB, actual_mc, actual_kc, cols);
+      }
+    }
+
+    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+    ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  }
+};
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H