Generalize first_aligned to take the requested alignment as a template parameter, and add a first_default_aligned variante calling first_aligned with the requirement of the largest packet for the given scalar type.

2025-04-24 19:40:45 +08:00 · 2015-08-06 17:52:01 +02:00 · 2015-08-06 17:52:01 +02:00 · 2afdef6a54
commit 2afdef6a54
parent 1f5024332e
14 changed files with 69 additions and 48 deletions
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@ -365,13 +365,14 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
    typedef typename Kernel::Scalar Scalar;
    typedef packet_traits<Scalar> PacketTraits;
    enum {
+      requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
      packetSize = PacketTraits::size,
-      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
-      dstAlignment = PacketTraits::AlignedOnScalar ? int(Kernel::AssignmentTraits::RequiredAlignment)
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = PacketTraits::AlignedOnScalar ? int(requestedAlignment)
                                                   : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;

    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
@ -479,9 +480,10 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
    typedef packet_traits<Scalar> PacketTraits;
    enum {
      packetSize = PacketTraits::size,
+      requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
      alignable = PacketTraits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
-      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
-      dstAlignment = alignable ? int(Kernel::AssignmentTraits::RequiredAlignment)
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = alignable ? int(requestedAlignment)
                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
@ -494,7 +496,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);

    for(Index outer = 0; outer < outerSize; ++outer)
    {
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@ -580,33 +580,41 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>

 namespace internal {

-template<typename Derived, bool JustReturnZero>
+template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
  static inline Index run(const Derived&)
  { return 0; }
 };

-template<typename Derived>
-struct first_aligned_impl<Derived, false>
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
 {
  static inline Index run(const Derived& m)
  {
-    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+    return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
  }
 };

-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+  *
+  * \tparam Alignment requested alignment in Bytes.
  *
  * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
  * documentation.
  */
-template<typename Derived>
+template<int Alignment, typename Derived>
 static inline Index first_aligned(const DenseBase<Derived>& m)
 {
-  return first_aligned_impl
-          <Derived, (evaluator<Derived>::Alignment > 0 ) || !(Derived::Flags & DirectAccessBit)> // FIXME Alignment!
-          ::run(m.derived());
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
+template<typename Derived>
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
+{
+  typedef typename Derived::Scalar Scalar;
+  return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(m);
 }

 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@ -221,12 +221,13 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
  {
    const Index size = mat.size();
    
-    const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat.nestedExpression());
+    const Index packetSize  = packet_traits<Scalar>::size;
+    const int packetBytes = int(packetSize*sizeof(Scalar));
    enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(sizeof(Scalar)*packetSize) : int(Unaligned), // FIXME take into account alignment requirement
+      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetBytes) : int(Unaligned), // FIXME take into account alignment requirement
      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
    };
+    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
    const Index alignedEnd2 = alignedStart + alignedSize2;
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@ -178,7 +178,7 @@ MatrixBase<Derived>::stableNorm() const
  if(n==1)
    return abs(this->coeff(0));
  
-  Index bi = internal::first_aligned(copy);
+  Index bi = internal::first_default_aligned(copy);
  if (bi>0)
    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
  for (; bi<n; bi+=blockSize)
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
+  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@ -94,7 +94,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd

    size_t starti = FirstTriangular ? 0 : j+2;
    size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+    size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);

    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@ -257,6 +257,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
    Scalar* _res,        Index resStride,
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
  {
+    const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
    // strip zeros
    Index diagSize  = (std::min)(_cols,_depth);
    Index rows      = _rows;
@ -311,7 +312,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;

      Scalar* geb = blockB+ts*ts;
-      geb = geb + internal::first_aligned(geb,EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar));
+      geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));

      pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);

--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@ -230,7 +230,7 @@ class blas_data_mapper {
    if (size_t(m_data)%sizeof(Scalar)) {
      return -1;
    }
-    return internal::first_aligned(m_data, size);
+    return internal::first_default_aligned(m_data, size);
  }

  protected:
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@ -506,47 +506,56 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align

 /****************************************************************************/

-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
  *
+  * \tparam Alignment requested alignment in Bytes.
  * \param array the address of the start of the array
  * \param size the size of the array
  *
-  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
-  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+  * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
  * packet size for the given scalar type is 1, then everything is considered well-aligned.
  *
-  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
-  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
-  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
+  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
  * example with Scalar=double on certain 32-bit platforms, see bug #79.
  *
  * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  * \sa first_default_aligned()
  */
-template<typename Scalar, typename Index>
+template<int Alignment, typename Scalar, typename Index>
 inline Index first_aligned(const Scalar* array, Index size)
 {
-  static const Index PacketSize = packet_traits<Scalar>::size;
-  static const Index PacketAlignedMask = PacketSize-1;
+  static const Index ScalarSize = sizeof(Scalar);
+  static const Index AlignmentSize = Alignment / ScalarSize;
+  static const Index AlignmentMask = AlignmentSize-1;

-  if(PacketSize==1)
+  if(AlignmentSize<=1)
  {
-    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
-    // of the array have the same alignment.
+    // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
+    // so that all elements of the array have the same alignment.
    return 0;
  }
-  else if(size_t(array) & (sizeof(Scalar)-1))
+  else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
  {
-    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
    // Consequently, no element of the array is well aligned.
    return size;
  }
  else
  {
-    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, size);
+    return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size);
  }
 }

+/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
+   * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+inline Index first_default_aligned(const Scalar* array, Index size)
+{
+  return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(array, size);
+}
+
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
  */ 
 template<typename Index> 
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@ -325,7 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
    // both vectors are sequentially stored in memory => vectorization
    enum { Peeling = 2 };

-    Index alignedStart = internal::first_aligned(y, size);
+    Index alignedStart = internal::first_default_aligned(y, size);
    Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;

    const Packet pc = pset1<Packet>(c);
@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
    Scalar* EIGEN_RESTRICT px = x + alignedStart;
    Scalar* EIGEN_RESTRICT py = y + alignedStart;

-    if(internal::first_aligned(x, size)==alignedStart)
+    if(internal::first_default_aligned(x, size)==alignedStart)
    {
      for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
      {
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@ -39,9 +39,9 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
  };
  Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
  Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
-  Index i0 = internal::first_aligned(A,m);
+  Index i0 = internal::first_default_aligned(A,m);
  
-  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
+  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
  
  // handle the non aligned rows of A and C without any optimization:
  for(Index i=0; i<i0; ++i)
--- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@ -66,8 +66,8 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index seg
  const Index PacketSize = internal::packet_traits<Scalar>::size;
  Index ldl = internal::first_multiple(nrow, PacketSize);
  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
-  Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
-  Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
+  Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize);
+  Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
  Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
  
  l.setZero();
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@ -145,7 +145,7 @@ void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w,
      eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
      
      Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
-      Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
+      Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
      
      L.setZero();
--- a/test/first_aligned.cpp
+++ b/test/first_aligned.cpp
@ -13,7 +13,7 @@ template<typename Scalar>
 void test_first_aligned_helper(Scalar *array, int size)
 {
  const int packet_size = sizeof(Scalar) * internal::packet_traits<Scalar>::size;
-  VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_aligned(array, size)) % packet_size) == 0);
+  VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_default_aligned(array, size)) % packet_size) == 0);
 }

 template<typename Scalar>
@ -21,7 +21,7 @@ void test_none_aligned_helper(Scalar *array, int size)
 {
  EIGEN_UNUSED_VARIABLE(array);
  EIGEN_UNUSED_VARIABLE(size);
-  VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_aligned(array, size) == size);
+  VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_default_aligned(array, size) == size);
 }

 struct some_non_vectorizable_type { float x; };