fix some internal asserts in CacheFriendlyProduct

Gael Guennebaud 2008-07-27 22:14:08 +00:00
parent 02a7efa910
commit 44d95e0540
4 changed files with 24 additions and 18 deletions

View File

@@ -359,19 +359,6 @@ static void ei_cache_friendly_product(
#endif // EIGEN_EXTERN_INSTANTIATIONS
-template<typename Scalar>
-inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
-{
-  typedef typename ei_packet_traits<Scalar>::type Packet;
-  const int PacketSize = ei_packet_traits<Scalar>::size;
-  const int PacketAlignedMask = PacketSize-1;
-  const bool Vectorized = PacketSize>1;
-  return Vectorized
-    ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask))
-                     & PacketAlignedMask, maxOffset)
-    : 0;
-}
/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows it to both reduce
 * the number of load/stores of the result by a factor of 4 and to reduce
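As an aside, here is a minimal standalone sketch of the "4 columns at once" idea (a plain scalar version, not Eigen's actual kernel, which additionally vectorizes with packets and handles alignment): accumulating four columns per pass loads and stores each entry of the result once per four columns instead of once per column.

  // Hedged sketch: col-major matrix * vector, accumulating into res,
  // processing 4 columns per pass to cut res load/store traffic by 4x.
  void colmajor_times_vector(const float* mat, int rows, int cols, int stride,
                             const float* rhs, float* res)
  {
    int j = 0;
    for (; j + 4 <= cols; j += 4)          // 4 columns at once
      for (int i = 0; i < rows; ++i)
        res[i] += mat[i + (j+0)*stride] * rhs[j+0]
                + mat[i + (j+1)*stride] * rhs[j+1]
                + mat[i + (j+2)*stride] * rhs[j+2]
                + mat[i + (j+3)*stride] * rhs[j+3];
    for (; j < cols; ++j)                  // remaining columns
      for (int i = 0; i < rows; ++i)
        res[i] += mat[i + j*stride] * rhs[j];
  }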
@@ -420,7 +407,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
// we cannot assume the first element is aligned because of sub-matrices
const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
-ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize || PacketSize==1);
// find how many columns we have to skip to be aligned with the result (if possible)
int skipColumns=0;
@@ -438,7 +425,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
// note that the skipped columns are processed later.
}
-ei_internal_assert((alignmentPattern==NoneAligned)
+ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
|| (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
@@ -585,7 +572,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
// we cannot assume the first element is aligned because of sub-matrices
const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
-ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || PacketSize==1 || size<PacketSize);
// find how many rows we have to skip to be aligned with rhs (if possible)
int skipRows=0;
for (; skipRows<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipRows; ++skipRows)
@@ -601,7 +588,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
skipRows = std::min(skipRows,res.size());
// note that the skipped rows are processed later.
}
-ei_internal_assert((alignmentPattern==NoneAligned)
+ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
|| (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
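The relaxed asserts cover two cases the old ones missed: when PacketSize==1 there is no alignment requirement at all, and when the vector is shorter than one packet (size<PacketSize), ei_alignmentOffset clamps its result to maxOffset, so the returned offset need not reach an aligned address. Here is a minimal standalone sketch reproducing the second case (alignment_offset is a hypothetical stand-in for ei_alignmentOffset, specialized to float with 16-byte packets of 4 elements):

  #include <algorithm>
  #include <cassert>
  #include <cstddef>
  #include <cstdio>

  const int PacketSize = 4;   // e.g. 4 floats per 16-byte SSE packet

  // hypothetical stand-in for ei_alignmentOffset<float>
  int alignment_offset(const float* ptr, int maxOffset)
  {
    const int mask = PacketSize - 1;
    return std::min<int>((PacketSize - int((std::size_t(ptr) / sizeof(float)) & mask)) & mask,
                         maxOffset);
  }

  int main()
  {
    alignas(16) float data[8] = {};
    const float* sub = data + 1;  // misaligned start, as with a sub-matrix
    const int size = 2;           // fewer elements than one packet
    const int off = alignment_offset(sub, size);  // clamped to 2
    const bool aligned = std::size_t(sub + off) % (PacketSize * sizeof(float)) == 0;
    std::printf("offset=%d aligned=%d\n", off, int(aligned)); // offset=2 aligned=0
    // the old assert would fire here; the relaxed one passes:
    assert(aligned || size < PacketSize || PacketSize == 1);
  }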

View File

@@ -120,5 +120,19 @@ template <typename Scalar, typename Packet, int LoadMode> inline void ei_pstoret
ei_pstoreu(to, from);
}
+/** \internal \returns the number of elements which have to be skipped such that data are aligned */
+template<typename Scalar>
+inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
+{
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = ei_packet_traits<Scalar>::size;
+  const int PacketAlignedMask = PacketSize-1;
+  const bool Vectorized = PacketSize>1;
+  return Vectorized
+    ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask))
+                     & PacketAlignedMask, maxOffset)
+    : 0;
+}
#endif // EIGEN_DUMMY_PACKET_MATH_H
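For reference, the offset formula above reduces, for an element whose index within an aligned block is i (mod PacketSize), to (PacketSize - i) & (PacketSize - 1). A quick standalone check of that arithmetic (a sketch, not Eigen code), assuming float with PacketSize == 4:

  #include <cstdio>

  int main()
  {
    const int PacketSize = 4, mask = PacketSize - 1;
    for (int i = 0; i < PacketSize; ++i)
      std::printf("index mod 4 = %d -> skip %d elements\n",
                  i, (PacketSize - i) & mask);
    // prints: 0 -> 0, 1 -> 3, 2 -> 2, 3 -> 1
  }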

View File

@@ -98,6 +98,8 @@ struct ei_trisolve_selector<Lhs,Rhs,Upper,RowMajor>
};
// forward substitution, col-major
+// FIXME the Lower and Upper specializations could be merged using a small helper class
+// performing reflections on the coordinates...
template<typename Lhs, typename Rhs>
struct ei_trisolve_selector<Lhs,Rhs,Lower,ColMajor>
{
@@ -138,6 +140,8 @@ struct ei_trisolve_selector<Lhs,Rhs,Lower,ColMajor>
* other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
* * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
*/
+// FIXME this is cool, but what about conjugate/adjoint expressions? do we want to evaluate them?
+// this is a more general problem though.
ei_cache_friendly_product_colmajor_times_vector(
size-endBlock, &(lhs.const_cast_derived().coeffRef(endBlock,startBlock)), lhs.stride(),
btmp, &(other.coeffRef(endBlock,c)));
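For context, the kernel call above performs the block update of a column-oriented forward substitution. A hedged sketch of the plain, unblocked algorithm it accelerates (not Eigen's implementation): solving L*x = b in place for lower-triangular L, where once x[j] is known, column j of L below the diagonal is subtracted from the remaining right-hand side. Eigen batches that update over column blocks and routes it through ei_cache_friendly_product_colmajor_times_vector.

  #include <vector>

  // forward substitution, col-major storage: x holds b on entry, x on exit
  void forward_subst_colmajor(const std::vector<double>& L, int n,
                              std::vector<double>& x)
  {
    for (int j = 0; j < n; ++j)
    {
      x[j] /= L[j + j*n];              // divide by the diagonal entry
      for (int i = j + 1; i < n; ++i)  // update the tail of the rhs with
        x[i] -= L[i + j*n] * x[j];     // column j below the diagonal
    }
  }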

View File

@@ -379,6 +379,7 @@ struct ei_product_coeff_vectorized_dyn_selector
};
// NOTE the 3 following specializations exist because taking .col(0) on a vector is a bit slower
+// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
@@ -406,7 +407,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
template<typename Lhs, typename Rhs>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
-inline static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+inline static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = ei_dot_impl<
Lhs,