mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-25 18:50:40 +08:00
Generalize first_aligned to take the requested alignment as a template parameter, and add a first_default_aligned variante calling first_aligned with the requirement of the largest packet for the given scalar type.
This commit is contained in:
parent
1f5024332e
commit
2afdef6a54
@ -365,13 +365,14 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
|
||||
typedef typename Kernel::Scalar Scalar;
|
||||
typedef packet_traits<Scalar> PacketTraits;
|
||||
enum {
|
||||
requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
|
||||
packetSize = PacketTraits::size,
|
||||
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
|
||||
dstAlignment = PacketTraits::AlignedOnScalar ? int(Kernel::AssignmentTraits::RequiredAlignment)
|
||||
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
|
||||
dstAlignment = PacketTraits::AlignedOnScalar ? int(requestedAlignment)
|
||||
: int(Kernel::AssignmentTraits::DstAlignment),
|
||||
srcAlignment = Kernel::AssignmentTraits::JointAlignment
|
||||
};
|
||||
const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
|
||||
const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
|
||||
const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
|
||||
|
||||
unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
|
||||
@ -479,9 +480,10 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
|
||||
typedef packet_traits<Scalar> PacketTraits;
|
||||
enum {
|
||||
packetSize = PacketTraits::size,
|
||||
requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
|
||||
alignable = PacketTraits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
|
||||
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
|
||||
dstAlignment = alignable ? int(Kernel::AssignmentTraits::RequiredAlignment)
|
||||
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
|
||||
dstAlignment = alignable ? int(requestedAlignment)
|
||||
: int(Kernel::AssignmentTraits::DstAlignment)
|
||||
};
|
||||
const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
|
||||
@ -494,7 +496,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
|
||||
const Index innerSize = kernel.innerSize();
|
||||
const Index outerSize = kernel.outerSize();
|
||||
const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
|
||||
Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
|
||||
Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
|
||||
|
||||
for(Index outer = 0; outer < outerSize; ++outer)
|
||||
{
|
||||
|
@ -580,33 +580,41 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived, bool JustReturnZero>
|
||||
template<int Alignment, typename Derived, bool JustReturnZero>
|
||||
struct first_aligned_impl
|
||||
{
|
||||
static inline Index run(const Derived&)
|
||||
{ return 0; }
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
struct first_aligned_impl<Derived, false>
|
||||
template<int Alignment, typename Derived>
|
||||
struct first_aligned_impl<Alignment, Derived, false>
|
||||
{
|
||||
static inline Index run(const Derived& m)
|
||||
{
|
||||
return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
|
||||
return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
|
||||
}
|
||||
};
|
||||
|
||||
/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
|
||||
/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
|
||||
*
|
||||
* \tparam Alignment requested alignment in Bytes.
|
||||
*
|
||||
* There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
|
||||
* documentation.
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<int Alignment, typename Derived>
|
||||
static inline Index first_aligned(const DenseBase<Derived>& m)
|
||||
{
|
||||
return first_aligned_impl
|
||||
<Derived, (evaluator<Derived>::Alignment > 0 ) || !(Derived::Flags & DirectAccessBit)> // FIXME Alignment!
|
||||
::run(m.derived());
|
||||
enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
|
||||
return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
static inline Index first_default_aligned(const DenseBase<Derived>& m)
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(m);
|
||||
}
|
||||
|
||||
template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
|
||||
|
@ -221,12 +221,13 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
{
|
||||
const Index size = mat.size();
|
||||
|
||||
const Index packetSize = packet_traits<Scalar>::size;
|
||||
const Index alignedStart = internal::first_aligned(mat.nestedExpression());
|
||||
const Index packetSize = packet_traits<Scalar>::size;
|
||||
const int packetBytes = int(packetSize*sizeof(Scalar));
|
||||
enum {
|
||||
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(sizeof(Scalar)*packetSize) : int(Unaligned), // FIXME take into account alignment requirement
|
||||
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetBytes) : int(Unaligned), // FIXME take into account alignment requirement
|
||||
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
|
||||
};
|
||||
const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
|
||||
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
|
||||
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
|
||||
const Index alignedEnd2 = alignedStart + alignedSize2;
|
||||
|
@ -178,7 +178,7 @@ MatrixBase<Derived>::stableNorm() const
|
||||
if(n==1)
|
||||
return abs(this->coeff(0));
|
||||
|
||||
Index bi = internal::first_aligned(copy);
|
||||
Index bi = internal::first_default_aligned(copy);
|
||||
if (bi>0)
|
||||
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
|
||||
for (; bi<n; bi+=blockSize)
|
||||
|
@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
|
||||
|
||||
// How many coeffs of the result do we have to skip to be aligned.
|
||||
// Here we assume data are at least aligned on the base scalar type.
|
||||
Index alignedStart = internal::first_aligned(res,size);
|
||||
Index alignedStart = internal::first_default_aligned(res,size);
|
||||
Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
|
||||
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
|
||||
|
||||
|
@ -94,7 +94,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
|
||||
|
||||
size_t starti = FirstTriangular ? 0 : j+2;
|
||||
size_t endi = FirstTriangular ? j : size;
|
||||
size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
|
||||
size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
|
||||
size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
|
||||
|
||||
// TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
|
||||
|
@ -257,6 +257,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
|
||||
// strip zeros
|
||||
Index diagSize = (std::min)(_cols,_depth);
|
||||
Index rows = _rows;
|
||||
@ -311,7 +312,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
|
||||
|
||||
Scalar* geb = blockB+ts*ts;
|
||||
geb = geb + internal::first_aligned(geb,EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar));
|
||||
geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
|
||||
|
||||
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
|
||||
|
||||
|
@ -230,7 +230,7 @@ class blas_data_mapper {
|
||||
if (size_t(m_data)%sizeof(Scalar)) {
|
||||
return -1;
|
||||
}
|
||||
return internal::first_aligned(m_data, size);
|
||||
return internal::first_default_aligned(m_data, size);
|
||||
}
|
||||
|
||||
protected:
|
||||
|
@ -506,47 +506,56 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
|
||||
/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
|
||||
*
|
||||
* \tparam Alignment requested alignment in Bytes.
|
||||
* \param array the address of the start of the array
|
||||
* \param size the size of the array
|
||||
*
|
||||
* \note If no element of the array is well aligned, the size of the array is returned. Typically,
|
||||
* for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
|
||||
* \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
|
||||
* the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
|
||||
* packet size for the given scalar type is 1, then everything is considered well-aligned.
|
||||
*
|
||||
* \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
|
||||
* power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
|
||||
* other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
|
||||
* \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
|
||||
* power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
|
||||
* example with Scalar=double on certain 32-bit platforms, see bug #79.
|
||||
*
|
||||
* There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
|
||||
* \sa first_default_aligned()
|
||||
*/
|
||||
template<typename Scalar, typename Index>
|
||||
template<int Alignment, typename Scalar, typename Index>
|
||||
inline Index first_aligned(const Scalar* array, Index size)
|
||||
{
|
||||
static const Index PacketSize = packet_traits<Scalar>::size;
|
||||
static const Index PacketAlignedMask = PacketSize-1;
|
||||
static const Index ScalarSize = sizeof(Scalar);
|
||||
static const Index AlignmentSize = Alignment / ScalarSize;
|
||||
static const Index AlignmentMask = AlignmentSize-1;
|
||||
|
||||
if(PacketSize==1)
|
||||
if(AlignmentSize<=1)
|
||||
{
|
||||
// Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
|
||||
// of the array have the same alignment.
|
||||
// Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
|
||||
// so that all elements of the array have the same alignment.
|
||||
return 0;
|
||||
}
|
||||
else if(size_t(array) & (sizeof(Scalar)-1))
|
||||
else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
|
||||
{
|
||||
// There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
|
||||
// The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
|
||||
// Consequently, no element of the array is well aligned.
|
||||
return size;
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
|
||||
& PacketAlignedMask, size);
|
||||
return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size);
|
||||
}
|
||||
}
|
||||
|
||||
/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
|
||||
* \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
|
||||
template<typename Scalar, typename Index>
|
||||
inline Index first_default_aligned(const Scalar* array, Index size)
|
||||
{
|
||||
return first_aligned<packet_traits<Scalar>::size*sizeof(Scalar)>(array, size);
|
||||
}
|
||||
|
||||
/** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
|
||||
*/
|
||||
template<typename Index>
|
||||
|
@ -325,7 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
|
||||
// both vectors are sequentially stored in memory => vectorization
|
||||
enum { Peeling = 2 };
|
||||
|
||||
Index alignedStart = internal::first_aligned(y, size);
|
||||
Index alignedStart = internal::first_default_aligned(y, size);
|
||||
Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
|
||||
|
||||
const Packet pc = pset1<Packet>(c);
|
||||
@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
|
||||
Scalar* EIGEN_RESTRICT px = x + alignedStart;
|
||||
Scalar* EIGEN_RESTRICT py = y + alignedStart;
|
||||
|
||||
if(internal::first_aligned(x, size)==alignedStart)
|
||||
if(internal::first_default_aligned(x, size)==alignedStart)
|
||||
{
|
||||
for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
|
||||
{
|
||||
|
@ -39,9 +39,9 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
|
||||
};
|
||||
Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking
|
||||
Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once
|
||||
Index i0 = internal::first_aligned(A,m);
|
||||
Index i0 = internal::first_default_aligned(A,m);
|
||||
|
||||
eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
|
||||
eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
|
||||
|
||||
// handle the non aligned rows of A and C without any optimization:
|
||||
for(Index i=0; i<i0; ++i)
|
||||
|
@ -66,8 +66,8 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index seg
|
||||
const Index PacketSize = internal::packet_traits<Scalar>::size;
|
||||
Index ldl = internal::first_multiple(nrow, PacketSize);
|
||||
Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
|
||||
Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
|
||||
Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
|
||||
Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize);
|
||||
Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
|
||||
Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
|
||||
|
||||
l.setZero();
|
||||
|
@ -145,7 +145,7 @@ void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w,
|
||||
eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
|
||||
|
||||
Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
|
||||
Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
|
||||
Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
|
||||
Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
|
||||
|
||||
L.setZero();
|
||||
|
@ -13,7 +13,7 @@ template<typename Scalar>
|
||||
void test_first_aligned_helper(Scalar *array, int size)
|
||||
{
|
||||
const int packet_size = sizeof(Scalar) * internal::packet_traits<Scalar>::size;
|
||||
VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_aligned(array, size)) % packet_size) == 0);
|
||||
VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_default_aligned(array, size)) % packet_size) == 0);
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
@ -21,7 +21,7 @@ void test_none_aligned_helper(Scalar *array, int size)
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(array);
|
||||
EIGEN_UNUSED_VARIABLE(size);
|
||||
VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_aligned(array, size) == size);
|
||||
VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_default_aligned(array, size) == size);
|
||||
}
|
||||
|
||||
struct some_non_vectorizable_type { float x; };
|
||||
|
Loading…
x
Reference in New Issue
Block a user