mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-15 07:10:37 +08:00
PR 544: Set requestedAlignment correctly for SliceVectorizedTraversals
Commit aa110e681b
optimised the multiplication of small dyanmically
sized matrices by restricting the packet size to a maximum of 4, increasing
the chances that SIMD instructions are used in the computation. However, it
introduced a mismatch between the packet size and the requestedAlignment. This
mismatch can lead to crashes when the destination is not aligned. This patch
fixes the issue by ensuring that the AssignmentTraits are correctly computed
when using a restricted packet size.
* * *
Bind LinearPacketType to MaxPacketSize
This commit applies any packet size limit specified when instantiating
copy_using_evaluator_traits to the LinearPacketType, providing that the
size of the destination is not known at compile time.
* * *
Add unit test for restricted packet assignment
A new unit test is added to check that multiplication of small dynamically
sized matrices works correctly when the packet size is restricted to 4 and
the destination is unaligned.
This commit is contained in:
parent
3dc0845046
commit
670d56441c
@ -24,7 +24,7 @@ namespace internal {
|
||||
|
||||
// copy_using_evaluator_traits is based on assign_traits
|
||||
|
||||
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
|
||||
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
|
||||
struct copy_using_evaluator_traits
|
||||
{
|
||||
typedef typename DstEvaluator::XprType Dst;
|
||||
@ -51,13 +51,15 @@ private:
|
||||
InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
|
||||
: int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
|
||||
: int(Dst::MaxRowsAtCompileTime),
|
||||
RestrictedInnerSize = InnerSize == -1 ? MaxPacketSize : InnerSize,
|
||||
RestrictedLinearSize = Dst::SizeAtCompileTime == -1 ? MaxPacketSize : Dst::SizeAtCompileTime,
|
||||
OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
|
||||
MaxSizeAtCompileTime = Dst::SizeAtCompileTime
|
||||
};
|
||||
|
||||
// TODO distinguish between linear traversal and inner-traversals
|
||||
typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
|
||||
typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
|
||||
typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
|
||||
typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
|
||||
|
||||
enum {
|
||||
LinearPacketSize = unpacket_traits<LinearPacketType>::size,
|
||||
@ -711,7 +713,8 @@ protected:
|
||||
public:
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::DstXprType DstXprType;
|
||||
typedef typename find_best_packet<Scalar, 4>::type PacketType;
|
||||
typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
|
||||
typedef typename AssignmentTraits::PacketType PacketType;
|
||||
|
||||
EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
|
||||
: Base(dst, src, func, dstExpr)
|
||||
|
@ -90,6 +90,12 @@ namespace Eigen {
|
||||
{
|
||||
call_assignment_no_alias(dst.expression(), src, func);
|
||||
}
|
||||
|
||||
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC void call_restricted_packet_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
|
||||
{
|
||||
call_restricted_packet_assignment_no_alias(dst.expression(), src, func);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -496,4 +502,23 @@ EIGEN_DECLARE_TEST(evaluators)
|
||||
VERIFY_IS_EQUAL( get_cost(a*(a+b)), 1);
|
||||
VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a+b)), 15);
|
||||
}
|
||||
|
||||
{
|
||||
// test restricted_packet_assignment with an unaligned destination
|
||||
const size_t M = 2;
|
||||
const size_t K = 2;
|
||||
const size_t N = 5;
|
||||
float *destMem = new float[(M*N) + 1];
|
||||
float *dest = (internal::UIntPtr(destMem)%EIGEN_MAX_ALIGN_BYTES) == 0 ? destMem+1 : destMem;
|
||||
|
||||
const Matrix<float, Dynamic, Dynamic, RowMajor> a = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(M, K);
|
||||
const Matrix<float, Dynamic, Dynamic, RowMajor> b = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(K, N);
|
||||
|
||||
Map<Matrix<float, Dynamic, Dynamic, RowMajor> > z(dest, M, N);;
|
||||
Product<Matrix<float, Dynamic, Dynamic, RowMajor>, Matrix<float, Dynamic, Dynamic, RowMajor>, LazyProduct> tmp(a,b);
|
||||
internal::call_restricted_packet_assignment(z.noalias(), tmp.derived(), internal::assign_op<float, float>());
|
||||
|
||||
VERIFY_IS_APPROX(z, a*b);
|
||||
delete[] destMem;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user