mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-07 18:27:40 +08:00
Fix excessive GEBP register spilling for 32-bit NEON.
Clang does a poor job of optimizing the GEBP microkernel on 32-bit ARM, leading to excessive 16-byte register spills, slowing down basic f32 matrix multiplication by approx 50%. By specializing `gebp_traits`, we can eliminate the register spills. Volatile inline ASM both acts as a barrier to prevent reordering and enforces strict register use. In a simple f32 matrix multiply example, this modification reduces 16-byte spills from 109 instances to zero, leading to a 1.5x speed increase (search for `16-byte Spill` in the assembly in https://godbolt.org/z/chsPbE). This is a replacement of !379. See there for further discussion. Also moved `gebp_traits` specializations for NEON to `Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h` to be alongside other NEON-specific code. Fixes #2138.
This commit is contained in:
parent
56c8b14d87
commit
f85038b7f3
@ -340,7 +340,9 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/ConditionEstimator.h"
|
||||
|
||||
#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
|
||||
#include "src/Core/arch/AltiVec/MatrixProduct.h"
|
||||
#include "src/Core/arch/AltiVec/MatrixProduct.h"
|
||||
#elif defined EIGEN_VECTORIZE_NEON
|
||||
#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/BooleanRedux.h"
|
||||
|
183
Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
Normal file
183
Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
Normal file
@ -0,0 +1,183 @@
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
|
||||
|
||||
// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
|
||||
// Here we specialize gebp_traits to eliminate these register spills.
|
||||
// See #2138.
|
||||
template<>
|
||||
struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
|
||||
: gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
|
||||
{
|
||||
// This volatile inline ASM both acts as a barrier to prevent reordering,
|
||||
// as well as enforces strict register use.
|
||||
asm volatile(
|
||||
"vmla.f32 %q[r], %q[c], %q[alpha]"
|
||||
: [r] "+w" (r)
|
||||
: [c] "w" (c),
|
||||
[alpha] "w" (alpha)
|
||||
: );
|
||||
}
|
||||
|
||||
template <typename LaneIdType>
|
||||
EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
|
||||
Packet4f& c, Packet4f& tmp,
|
||||
const LaneIdType&) const {
|
||||
acc(a, b, c);
|
||||
}
|
||||
|
||||
template <typename LaneIdType>
|
||||
EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
|
||||
Packet4f& c, Packet4f& tmp,
|
||||
const LaneIdType& lane) const {
|
||||
madd(a, b.get(lane), c, tmp, lane);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
|
||||
|
||||
#if EIGEN_ARCH_ARM64
|
||||
|
||||
template<>
|
||||
struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
|
||||
: gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
|
||||
{
|
||||
typedef float RhsPacket;
|
||||
typedef float32x4_t RhsPacketx4;
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
dest = *b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
||||
{
|
||||
dest = vld1q_f32(b);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
dest = *b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
||||
{}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
loadRhs(b,dest);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{
|
||||
c = vfmaq_n_f32(c, a, b);
|
||||
}
|
||||
|
||||
// NOTE: Template parameter inference failed when compiled with Android NDK:
|
||||
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{ madd_helper<0>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
|
||||
{ madd_helper<1>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
|
||||
{ madd_helper<2>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
|
||||
{ madd_helper<3>(a, b, c); }
|
||||
|
||||
private:
|
||||
template<int LaneID>
|
||||
EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
|
||||
{
|
||||
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
|
||||
// workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
|
||||
// vfmaq_laneq_f32 is implemented through a costly dup
|
||||
if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
#else
|
||||
c = vfmaq_laneq_f32(c, a, b, LaneID);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<>
|
||||
struct gebp_traits <double,double,false,false,Architecture::NEON>
|
||||
: gebp_traits<double,double,false,false,Architecture::Generic>
|
||||
{
|
||||
typedef double RhsPacket;
|
||||
|
||||
struct RhsPacketx4 {
|
||||
float64x2_t B_0, B_1;
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
dest = *b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
||||
{
|
||||
dest.B_0 = vld1q_f64(b);
|
||||
dest.B_1 = vld1q_f64(b+2);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
loadRhs(b,dest);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
||||
{}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
loadRhs(b,dest);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{
|
||||
c = vfmaq_n_f64(c, a, b);
|
||||
}
|
||||
|
||||
// NOTE: Template parameter inference failed when compiled with Android NDK:
|
||||
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{ madd_helper<0>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
|
||||
{ madd_helper<1>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
|
||||
{ madd_helper<2>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
|
||||
{ madd_helper<3>(a, b, c); }
|
||||
|
||||
private:
|
||||
template <int LaneID>
|
||||
EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
|
||||
{
|
||||
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
|
||||
// workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
|
||||
// vfmaq_laneq_f64 is implemented through a costly dup
|
||||
if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
|
||||
else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
|
||||
else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
|
||||
else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
|
||||
#else
|
||||
if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
|
||||
else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
|
||||
else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
|
||||
else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
#endif // EIGEN_ARCH_ARM64
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
@ -1076,148 +1076,6 @@ protected:
|
||||
|
||||
};
|
||||
|
||||
|
||||
#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
|
||||
|
||||
template<>
|
||||
struct gebp_traits <float, float, false, false,Architecture::NEON,GEBPPacketFull>
|
||||
: gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
|
||||
{
|
||||
typedef float RhsPacket;
|
||||
|
||||
typedef float32x4_t RhsPacketx4;
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
dest = *b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
||||
{
|
||||
dest = vld1q_f32(b);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
dest = *b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
||||
{}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
loadRhs(b,dest);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{
|
||||
c = vfmaq_n_f32(c, a, b);
|
||||
}
|
||||
|
||||
// NOTE: Template parameter inference failed when compiled with Android NDK:
|
||||
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{ madd_helper<0>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
|
||||
{ madd_helper<1>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
|
||||
{ madd_helper<2>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
|
||||
{ madd_helper<3>(a, b, c); }
|
||||
|
||||
private:
|
||||
template<int LaneID>
|
||||
EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
|
||||
{
|
||||
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
|
||||
// workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
|
||||
// vfmaq_laneq_f32 is implemented through a costly dup
|
||||
if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
|
||||
#else
|
||||
c = vfmaq_laneq_f32(c, a, b, LaneID);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<>
|
||||
struct gebp_traits <double, double, false, false,Architecture::NEON>
|
||||
: gebp_traits<double,double,false,false,Architecture::Generic>
|
||||
{
|
||||
typedef double RhsPacket;
|
||||
|
||||
struct RhsPacketx4 {
|
||||
float64x2_t B_0, B_1;
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
dest = *b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
||||
{
|
||||
dest.B_0 = vld1q_f64(b);
|
||||
dest.B_1 = vld1q_f64(b+2);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
loadRhs(b,dest);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
||||
{}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
loadRhs(b,dest);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{
|
||||
c = vfmaq_n_f64(c, a, b);
|
||||
}
|
||||
|
||||
// NOTE: Template parameter inference failed when compiled with Android NDK:
|
||||
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
{ madd_helper<0>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
|
||||
{ madd_helper<1>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
|
||||
{ madd_helper<2>(a, b, c); }
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
|
||||
{ madd_helper<3>(a, b, c); }
|
||||
|
||||
private:
|
||||
template <int LaneID>
|
||||
EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
|
||||
{
|
||||
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
|
||||
// workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
|
||||
// vfmaq_laneq_f64 is implemented through a costly dup
|
||||
if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
|
||||
else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
|
||||
else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
|
||||
else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
|
||||
#else
|
||||
if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
|
||||
else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
|
||||
else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
|
||||
else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/* optimized General packed Block * packed Panel product kernel
|
||||
*
|
||||
* Mixing type logic: C += A * B
|
||||
|
Loading…
Reference in New Issue
Block a user